# Testing NAGphormer

## Regular cmd run

In [1]:
# !python train.py --dataset photo --batch_size 2000 --dropout 0.1 --hidden_dim 128 \
#           --hops 3  --n_heads 8 --n_layers 1 --pe_dim 10 --peak_lr 0.001  --weight_decay=1e-05 

In [2]:
# python train.py --dataset photo --batch_size 2000 --dropout 0.1 --hidden_dim 128 --hops 3  --n_heads 8 --n_layers 1 --pe_dim 10 --peak_lr 0.001  --weight_decay=1e-05

## MODEL

In [3]:
import torch
import math
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

def init_params(module, n_layers):
    """Re-initialise the weights of a single sub-module in place.

    Intended to be passed to ``nn.Module.apply``.

    * ``nn.Linear``: weights drawn from N(0, (0.02 / sqrt(n_layers))^2),
      bias (when present) set to zero.
    * ``nn.Embedding``: weights drawn from N(0, 0.02^2).
    * Any other module type is left untouched.

    Args:
        module: the sub-module to (possibly) re-initialise.
        n_layers: number of encoder layers; scales down the std of linear weights.
    """
    if isinstance(module, nn.Linear):
        # The std shrinks with depth; computed only when a linear layer is seen.
        depth_scaled_std = 0.02 / math.sqrt(n_layers)
        module.weight.data.normal_(mean=0.0, std=depth_scaled_std)
        bias = module.bias
        if bias is not None:
            bias.data.zero_()
    if isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=0.02)



# def gelu(x): # not actually used, for testing implementations probably
#     """
#     GELU activation
#     https://arxiv.org/abs/1606.08415
#     https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
#     https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/modeling.py
#     """
#     # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
#     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))


### Feed forward network

In [4]:
# Basic FFN architecture: two linear layers with a GELU activation in between.
# ffn_size is the intermediate (inner) dimension.
# hidden_size is the shared representation dimension (input and output width).
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Linear.

    Args:
        hidden_size: width of the input and of the output.
        ffn_size: width of the intermediate representation.
        dropout_rate: accepted for interface compatibility; this block does
            not create or apply any dropout itself.
    """

    def __init__(self, hidden_size, ffn_size, dropout_rate):
        super().__init__()

        # Expand to the inner width, apply the non-linearity, project back.
        self.layer1 = nn.Linear(hidden_size, ffn_size)
        self.gelu = nn.GELU()
        self.layer2 = nn.Linear(ffn_size, hidden_size)

    def forward(self, x):
        """Apply the two-layer network to the last dimension of ``x``."""
        return self.layer2(self.gelu(self.layer1(x)))

### Multi Head Attention

In [5]:
# Multihead Attention with dropout on the attention scores

class MultiHeadAttention(nn.Module):
    """Scaled dot-product multi-head attention with dropout on the attention weights.

    Args:
        hidden_size: width of the inputs and of the output.
        attention_dropout_rate: dropout probability applied to the softmaxed
            attention weights.
        num_heads: number of attention heads; each head has width
            ``hidden_size // num_heads``.
    """

    def __init__(self, hidden_size, attention_dropout_rate, num_heads):
        super().__init__()

        self.num_heads = num_heads

        # Per-head width; the projections below map to num_heads * att_size.
        self.att_size = att_size = hidden_size // num_heads
        # 1 / sqrt(d_k): the "scaled" part of scaled dot-product attention.
        self.scale = att_size ** -0.5

        projected_size = num_heads * att_size
        self.linear_q = nn.Linear(hidden_size, projected_size)
        self.linear_k = nn.Linear(hidden_size, projected_size)
        self.linear_v = nn.Linear(hidden_size, projected_size)
        self.att_dropout = nn.Dropout(attention_dropout_rate)

        self.output_layer = nn.Linear(projected_size, hidden_size)

    def forward(self, q, k, v, attn_bias=None):
        """Compute softmax((Q K^T) / sqrt(d_k)) V per head and merge the heads.

        Args:
            q, k, v: tensors whose first dimension is the batch and whose last
                dimension is ``hidden_size``.
            attn_bias: optional tensor added to the raw attention scores
                before the softmax.

        Returns:
            A tensor with the same size as ``q`` (checked by an assertion).
        """
        expected_size = q.size()
        n_batch = q.size(0)
        heads = self.num_heads
        head_dim = self.att_size

        def split_heads(projected):
            # [batch, len, heads * head_dim] -> [batch, heads, len, head_dim]
            return projected.view(n_batch, -1, heads, head_dim).transpose(1, 2)

        queries = split_heads(self.linear_q(q))
        values = split_heads(self.linear_v(v))
        keys_t = split_heads(self.linear_k(k)).transpose(2, 3)  # [batch, heads, head_dim, k_len]

        # Scale the queries first, then take the dot products.
        queries = queries * self.scale
        scores = torch.matmul(queries, keys_t)  # [batch, heads, q_len, k_len]
        if attn_bias is not None:
            scores = scores + attn_bias

        weights = self.att_dropout(torch.softmax(scores, dim=3))
        context = weights.matmul(values)  # [batch, heads, q_len, head_dim]

        # Put the heads back next to each other and concatenate them.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(n_batch, -1, heads * head_dim)

        result = self.output_layer(merged)

        assert result.size() == expected_size
        return result

### Encoder Layer

In [6]:
# Combines multi-head self-attention (MSA) and an FFN, each preceded by a LayerNorm and wrapped in a residual connection

class EncoderLayer(nn.Module):
    """One pre-norm Transformer encoder layer: self-attention, then feed-forward.

    Each of the two sub-blocks is computed as
    ``x + Dropout(SubBlock(LayerNorm(x)))``.

    Args:
        hidden_size: width of the token representations.
        ffn_size: inner width of the feed-forward network.
        dropout_rate: dropout probability applied to each sub-block's output.
        attention_dropout_rate: dropout probability forwarded to the attention module.
        num_heads: number of attention heads.
    """

    def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, num_heads):
        super().__init__()

        # Attention sub-block.
        self.self_attention_norm = nn.LayerNorm(hidden_size)
        self.self_attention = MultiHeadAttention(
            hidden_size, attention_dropout_rate, num_heads)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        # Feed-forward sub-block.
        self.ffn_norm = nn.LayerNorm(hidden_size)
        self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, attn_bias=None):
        """Run both sub-blocks and return a tensor shaped like ``x``.

        Args:
            x: input representations.
            attn_bias: optional bias forwarded unchanged to the attention module.
        """
        # Self-attention with query = key = value = normalised input.
        normed = self.self_attention_norm(x)
        attended = self.self_attention(normed, normed, normed, attn_bias)
        x = x + self.self_attention_dropout(attended)

        # Feed-forward on the normalised intermediate result.
        transformed = self.ffn(self.ffn_norm(x))
        x = x + self.ffn_dropout(transformed)
        return x

### Transformer model

In [7]:
class TransformerModel(nn.Module):
    """NAGphormer-style node classifier over per-node hop sequences.

    Each node is represented by a sequence of ``hops + 1`` tokens (token 0 is
    read as the node itself, the remaining tokens as its neighbourhood
    aggregates). The sequence is embedded, passed through ``n_layers``
    encoder layers, and the neighbourhood tokens are pooled with a learned
    attention readout before classification.

    Args:
        hops: number of hops; the expected sequence length is ``hops + 1``.
        n_class: number of output classes.
        input_dim: feature width of every input token.
        pe_dim: positional-encoding width; stored but not used by this module.
        n_layers: number of encoder layers (also scales the linear-weight init).
        num_heads: number of attention heads per encoder layer.
        hidden_dim: width of the token representations.
        ffn_dim: accepted for interface compatibility but ignored; the
            feed-forward width is always ``2 * hidden_dim``.
        dropout_rate: dropout probability inside the encoder layers.
        attention_dropout_rate: dropout probability on the attention weights.
    """

    def __init__(
        self,
        hops,
        n_class,
        input_dim,
        pe_dim,
        n_layers=6,
        num_heads=8,
        hidden_dim=64,
        ffn_dim=64,
        dropout_rate=0.0,
        attention_dropout_rate=0.1
    ):
        super().__init__()

        self.seq_len = hops + 1
        self.pe_dim = pe_dim
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.ffn_dim = 2 * hidden_dim  # the ffn_dim argument is deliberately not used
        self.num_heads = num_heads

        self.n_layers = n_layers
        self.n_class = n_class

        self.dropout_rate = dropout_rate
        self.attention_dropout_rate = attention_dropout_rate

        # Initial token embedding (no positional encoding is added here).
        self.att_embeddings_nope = nn.Linear(self.input_dim, self.hidden_dim)

        encoders = [EncoderLayer(self.hidden_dim, self.ffn_dim, self.dropout_rate,
                                 self.attention_dropout_rate, self.num_heads)
                    for _ in range(self.n_layers)]
        self.layers = nn.ModuleList(encoders)
        self.final_ln = nn.LayerNorm(hidden_dim)

        self.out_proj = nn.Linear(self.hidden_dim, self.hidden_dim // 2)

        # Scores one neighbourhood token from the pair [node token, neighbourhood token].
        self.attn_layer = nn.Linear(2 * self.hidden_dim, 1)

        self.Linear1 = nn.Linear(self.hidden_dim // 2, self.n_class)

        # Kept so existing checkpoints still load; not read in forward().
        self.scaling = nn.Parameter(torch.ones(1) * 0.5)

        self.apply(lambda module: init_params(module, n_layers=n_layers))

    def forward(self, batched_data):
        """Return per-class log-probabilities of shape ``[batch, n_class]``.

        Args:
            batched_data: tensor of shape ``[batch, hops + 1, input_dim]``.
        """
        # Initial embeddings.
        tensor = self.att_embeddings_nope(batched_data)

        # Transformer encoder (n_layers of attention + feed-forward).
        for enc_layer in self.layers:
            tensor = enc_layer(tensor)

        output = self.final_ln(tensor)

        n_neighbors = self.seq_len - 1
        # node_tensor: [batch, 1, hidden]; neighbor_tensor: [batch, hops, hidden]
        node_tensor, neighbor_tensor = torch.split(output, [1, n_neighbors], dim=1)

        # Pair the node token with every neighbourhood token to score it.
        target = node_tensor.repeat(1, n_neighbors, 1)
        layer_atten = self.attn_layer(torch.cat((target, neighbor_tensor), dim=2))
        layer_atten = F.softmax(layer_atten, dim=1)  # normalise over the hops

        # Attention-weighted sum of the neighbourhood tokens.
        neighbor_tensor = torch.sum(neighbor_tensor * layer_atten, dim=1, keepdim=True)

        # Drop only the length-1 sequence axis. A bare squeeze() would also
        # drop the batch axis for a single-sample batch and break the
        # dim=1 log-softmax below.
        output = (node_tensor + neighbor_tensor).squeeze(1)

        output = self.Linear1(torch.relu(self.out_proj(output)))

        return torch.log_softmax(output, dim=1)

## Training

In [8]:
from data import get_dataset
import time
import utils
import random
import argparse
import numpy as np
import torch
import torch.nn.functional as F
from early_stop import EarlyStopping, Stop_args
from model import TransformerModel
from lr import PolynomialDecayLR
import os.path
import torch.utils.data as Data
import argparse

# Notebook stand-in for the command-line parser: every option is set explicitly.
args = argparse.Namespace(
    # main parameters
    dataset='photo',
    device='1',
    seed=3407,
    # model parameters
    hops=3,
    pe_dim=10,
    hidden_dim=128,
    ffn_dim=64,
    n_layers=1,
    n_heads=8,
    dropout=0.1,
    attention_dropout=0.1,
    # training parameters
    batch_size=2000,
    epochs=2000,
    tot_updates=1000,
    warmup_updates=400,
    peak_lr=0.001,
    end_lr=0.0001,
    weight_decay=0.00001,
    patience=50,
)

# Everything runs on the CPU in this notebook; args.device is not consulted.
device = 'cpu'

# Seed every random number generator in use with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(args.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)

In [57]:
# Load and pre-process data
# The get_dataset loader below is disabled. `adj` and `features` are instead
# produced by the "Alternative dataset" cell further down, which therefore has
# to be executed before this one (see the In [56] / In [57] execution counts).
#adj, features, labels, idx_train, idx_val, idx_test = get_dataset(args.dataset, args.pe_dim)


# Build the per-node hop sequences that the model consumes.
processed_features = utils.re_features(adj, features, args.hops)  # return (N, hops+1, d)

#### Alternative dataset begins here

In [56]:
# Options handed to the alternative (hypergraph) data loader.
args1 = argparse.Namespace(
    # main parameters
    path='path',
    data='cocitation',
    dataset='cora',
    isolated=True,
    split=1,  # does not matter (10 splits same data)
    train_rate=0.6,
    val_rate=0.2,  # new, for testing
    gpu=0,
    cuda=False,
    seed=42069,
    depth=2,
    dropout=0.5,
    epochs=20,
    embed_dim=128,
    self_loop=True,
    rate=0.01,
    decay=0.0005,
    batch_size=256,
)

import itertools

from data1 import data

dataset, train, test = data.load(args1)

# Clique expansion: every pair of nodes that share a hyperedge becomes one
# entry of the adjacency matrix.
# The pairs are gathered in a single pass and stored as int64. The previous
# version grew a float32 tensor with torch.cat inside the loop, which is
# quadratic in the number of pairs and cannot represent node ids above 2**24
# exactly.
# NOTE(review): combinations() emits each pair once, in member order, so the
# matrix is not symmetrised here - confirm whether downstream code expects that.
pairs = [
    pair
    for members in dataset['hypergraph'].values()
    for pair in itertools.combinations(members, 2)
]
indices = torch.tensor(pairs, dtype=torch.long).reshape(-1, 2)
# Transpose to the [2, nnz] layout and drop pairs produced by several hyperedges.
indices = torch.unique(indices.T, dim=1)
values = torch.ones(indices.shape[1])
# No explicit size is given, so the shape is inferred from the largest index.
adj = torch.sparse_coo_tensor(indices, values)

# Number of nodes, as given by the adjacency matrix.
args1.N = adj.shape[0]

features = torch.tensor(dataset['features'])
# Labels are stored one row per node; argmax turns each row into a class index.
labels = torch.tensor(dataset['labels']).argmax(1)

# Random train / validation / test split driven by the two rates in args1.
rand_indices = np.random.permutation(args1.N)
cut1 = int(args1.N * args1.train_rate)
cut2 = int(args1.N * (args1.train_rate + args1.val_rate))
idx_train, idx_val, idx_test = (
    list(part) for part in np.split(rand_indices, [cut1, cut2])
)

number of hyperedges is 1579


#### Alternative dataset ends here

In [58]:
labels = labels.to(device)


def _subset_loader(node_ids):
    """Return (TensorDataset, shuffling DataLoader) for the given node indices."""
    subset = Data.TensorDataset(processed_features[node_ids], labels[node_ids])
    loader = Data.DataLoader(subset, batch_size=args.batch_size, shuffle=True)
    return subset, loader


batch_data_train, train_data_loader = _subset_loader(idx_train)
batch_data_val, val_data_loader = _subset_loader(idx_val)
batch_data_test, test_data_loader = _subset_loader(idx_test)


# model configuration
model = TransformerModel(
    hops=args.hops,
    n_class=labels.max().item() + 1,  # class ids are taken to start at 0
    input_dim=features.shape[1],
    pe_dim=args.pe_dim,
    n_layers=args.n_layers,
    num_heads=args.n_heads,
    hidden_dim=args.hidden_dim,
    ffn_dim=args.ffn_dim,
    dropout_rate=args.dropout,
    attention_dropout_rate=args.attention_dropout,
)
model = model.to(device)

print(model)
print('total params:', sum(p.numel() for p in model.parameters()))

# AdamW at the peak learning rate; the scheduler takes the warm-up and decay settings from args.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=args.peak_lr,
    weight_decay=args.weight_decay,
)
lr_scheduler = PolynomialDecayLR(
    optimizer,
    warmup_updates=args.warmup_updates,
    tot_updates=args.tot_updates,
    lr=args.peak_lr,
    end_lr=args.end_lr,
    power=1.0,
)

TransformerModel(
  (att_embeddings_nope): Linear(in_features=1433, out_features=128, bias=True)
  (layers): ModuleList(
    (0): EncoderLayer(
      (self_attention_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttention(
        (linear_q): Linear(in_features=128, out_features=128, bias=True)
        (linear_k): Linear(in_features=128, out_features=128, bias=True)
        (linear_v): Linear(in_features=128, out_features=128, bias=True)
        (att_dropout): Dropout(p=0.1, inplace=False)
        (output_layer): Linear(in_features=128, out_features=128, bias=True)
      )
      (self_attention_dropout): Dropout(p=0.1, inplace=False)
      (ffn_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ffn): FeedForwardNetwork(
        (layer1): Linear(in_features=128, out_features=256, bias=True)
        (gelu): GELU(approximate='none')
        (layer2): Linear(in_features=256, out_features=128, bias=True)
      )
      (ffn_dro

In [59]:
def train_valid_epoch(epoch):
    """Run one training epoch followed by one validation pass and print a summary.

    Uses the notebook-level ``model``, ``optimizer``, ``lr_scheduler``,
    ``train_data_loader``, ``val_data_loader``, ``idx_train``, ``idx_val``
    and ``device``.

    Args:
        epoch: zero-based epoch index (printed as ``epoch + 1``).

    Returns:
        ``(loss_val, acc_val)``: the per-batch validation losses and the
        per-batch ``utils.accuracy_batch`` values, each summed over all
        validation batches. ``acc_val`` is returned un-normalised; only the
        printed value is divided by ``len(idx_val)``.
    """
    model.train()
    loss_train_b = 0
    acc_train_b = 0
    for nodes_features, labels in train_data_loader:
        nodes_features = nodes_features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = model(nodes_features)
        loss_train = F.nll_loss(output, labels)
        loss_train.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch, not per epoch

        loss_train_b += loss_train.item()
        acc_train_b += utils.accuracy_batch(output, labels).item()

    model.eval()
    loss_val = 0
    acc_val = 0
    # No gradients are needed for validation; skipping the autograd graph
    # saves memory and time without changing the computed values.
    with torch.no_grad():
        for nodes_features, labels in val_data_loader:
            nodes_features = nodes_features.to(device)
            labels = labels.to(device)

            output = model(nodes_features)
            loss_val += F.nll_loss(output, labels).item()
            acc_val += utils.accuracy_batch(output, labels).item()

    print('Epoch: {:04d}'.format(epoch+1),
        'loss_train: {:.4f}'.format(loss_train_b),
        'acc_train: {:.4f}'.format(acc_train_b/len(idx_train)),
        'loss_val: {:.4f}'.format(loss_val),
        'acc_val: {:.4f}'.format(acc_val/len(idx_val)))

    return loss_val, acc_val

In [60]:
def test():
    """Evaluate the notebook-level ``model`` on ``test_data_loader`` and print the result.

    Prints the per-batch losses summed over all test batches, and the summed
    ``utils.accuracy_batch`` values divided by ``len(idx_test)``. Returns nothing.
    """
    # Switch to evaluation mode once, rather than on every batch.
    model.eval()

    loss_test = 0
    acc_test = 0
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for nodes_features, labels in test_data_loader:
            nodes_features = nodes_features.to(device)
            labels = labels.to(device)

            output = model(nodes_features)
            loss_test += F.nll_loss(output, labels).item()
            acc_test += utils.accuracy_batch(output, labels).item()

    print("Test set results:",
        "loss= {:.4f}".format(loss_test),
        "accuracy= {:.4f}".format(acc_test/len(idx_test)))

In [61]:
# Wall-clock start, used for the "Train cost" report below.
t_total = time.time()
stopping_args = Stop_args(patience=args.patience, max_epochs=args.epochs)
early_stopping = EarlyStopping(model, **stopping_args)
# Train until the epoch budget is used up or the early-stopping check fires.
for epoch in range(args.epochs):
    loss_val, acc_val = train_valid_epoch(epoch)
    # NOTE(review): acc_val is the un-normalised sum returned by
    # train_valid_epoch, not the fraction it prints - confirm that
    # EarlyStopping only compares values between epochs.
    if early_stopping.check([acc_val, loss_val], epoch):
        break

print("Optimization Finished!")
print("Train cost: {:.4f}s".format(time.time() - t_total))
# Restore best model
print('Loading {}th epoch'.format(early_stopping.best_epoch+1))
model.load_state_dict(early_stopping.best_state)

# Final evaluation on the held-out test split with the restored weights.
test()

Epoch: 0001 loss_train: 1.9427 acc_train: 0.1847 loss_val: 1.9456 acc_val: 0.1624
Epoch: 0002 loss_train: 1.9427 acc_train: 0.1780 loss_val: 1.9452 acc_val: 0.1661
Epoch: 0003 loss_train: 1.9422 acc_train: 0.1866 loss_val: 1.9445 acc_val: 0.1697
Epoch: 0004 loss_train: 1.9415 acc_train: 0.1927 loss_val: 1.9436 acc_val: 0.1808
Epoch: 0005 loss_train: 1.9403 acc_train: 0.1940 loss_val: 1.9424 acc_val: 0.1974
Epoch: 0006 loss_train: 1.9391 acc_train: 0.2161 loss_val: 1.9411 acc_val: 0.2177
Epoch: 0007 loss_train: 1.9378 acc_train: 0.2309 loss_val: 1.9395 acc_val: 0.2306
Epoch: 0008 loss_train: 1.9362 acc_train: 0.2395 loss_val: 1.9377 acc_val: 0.2565
Epoch: 0009 loss_train: 1.9345 acc_train: 0.2617 loss_val: 1.9358 acc_val: 0.2694
Epoch: 0010 loss_train: 1.9324 acc_train: 0.2919 loss_val: 1.9336 acc_val: 0.2860
Epoch: 0011 loss_train: 1.9299 acc_train: 0.3011 loss_val: 1.9313 acc_val: 0.3063
Epoch: 0012 loss_train: 1.9272 acc_train: 0.3257 loss_val: 1.9288 acc_val: 0.3155
Epoch: 0013 loss

Epoch: 0101 loss_train: 0.3765 acc_train: 0.9243 loss_val: 0.9392 acc_val: 0.7066
Epoch: 0102 loss_train: 0.3597 acc_train: 0.9286 loss_val: 0.9389 acc_val: 0.7140
Epoch: 0103 loss_train: 0.3437 acc_train: 0.9280 loss_val: 0.9379 acc_val: 0.7122
Epoch: 0104 loss_train: 0.3281 acc_train: 0.9304 loss_val: 0.9347 acc_val: 0.7085
Epoch: 0105 loss_train: 0.3140 acc_train: 0.9329 loss_val: 0.9406 acc_val: 0.7103
Epoch: 0106 loss_train: 0.2989 acc_train: 0.9360 loss_val: 0.9425 acc_val: 0.7140
Epoch: 0107 loss_train: 0.2864 acc_train: 0.9353 loss_val: 0.9370 acc_val: 0.7103
Epoch: 0108 loss_train: 0.2728 acc_train: 0.9397 loss_val: 0.9403 acc_val: 0.7085
Epoch: 0109 loss_train: 0.2599 acc_train: 0.9464 loss_val: 0.9492 acc_val: 0.7066
Epoch: 0110 loss_train: 0.2474 acc_train: 0.9489 loss_val: 0.9499 acc_val: 0.7085
Epoch: 0111 loss_train: 0.2365 acc_train: 0.9575 loss_val: 0.9492 acc_val: 0.7085
Epoch: 0112 loss_train: 0.2249 acc_train: 0.9612 loss_val: 0.9578 acc_val: 0.7048
Epoch: 0113 loss