# Testing NAGphormer

## Regular command-line run

In [1]:
# !python train.py --dataset photo --batch_size 2000 --dropout 0.1 --hidden_dim 128 \
#           --hops 3  --n_heads 8 --n_layers 1 --pe_dim 10 --peak_lr 0.001  --weight_decay=1e-05 

In [2]:
# python train.py --dataset photo --batch_size 2000 --dropout 0.1 --hidden_dim 128 --hops 3  --n_heads 8 --n_layers 1 --pe_dim 10 --peak_lr 0.001  --weight_decay=1e-05

## MODEL

In [3]:
import torch
import math
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

def init_params(module, n_layers):
    """Re-initialise a single sub-module in place (meant for ``nn.Module.apply``).

    * ``nn.Linear``: weights drawn from N(0, 0.02 / sqrt(n_layers)); the bias,
      when present, is set to zero.
    * ``nn.Embedding``: weights drawn from N(0, 0.02).
    * Any other module type is left untouched.

    Args:
        module: the module to (possibly) re-initialise.
        n_layers: layer count used to shrink the std of Linear weights.
    """
    if isinstance(module, nn.Linear):
        # deeper stacks get proportionally smaller initial weights
        linear_std = 0.02 / math.sqrt(n_layers)
        nn.init.normal_(module.weight, mean=0.0, std=linear_std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    if isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=0.02)



# def gelu(x): # not actually used, for testing implementations probably
#     """
#     GELU activation
#     https://arxiv.org/abs/1606.08415
#     https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
#     https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/modeling.py
#     """
#     # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
#     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))


### Feed forward network

In [4]:
# Position-wise feed-forward block: Linear -> GELU -> Linear.
# hidden_size is the shared representation width (input and output);
# ffn_size is the width of the intermediate projection.
class FeedForwardNetwork(nn.Module):
    """Two-layer MLP with a GELU in between, mapping hidden_size -> hidden_size."""

    def __init__(self, hidden_size, ffn_size, dropout_rate):
        """Create the two projections.

        Args:
            hidden_size: input and output feature width.
            ffn_size: width of the intermediate layer.
            dropout_rate: accepted for signature compatibility; no dropout
                layer is created or applied inside this block.
        """
        super().__init__()

        self.layer1 = nn.Linear(hidden_size, ffn_size)
        self.gelu = nn.GELU()
        self.layer2 = nn.Linear(ffn_size, hidden_size)

    def forward(self, x):
        """Return layer2(GELU(layer1(x))); only the last dimension is transformed."""
        return self.layer2(self.gelu(self.layer1(x)))

### Multi Head Attention

In [5]:
# Scaled dot-product multi-head attention; dropout is applied to the
# attention weights (after the softmax).

class MultiHeadAttention(nn.Module):
    """Multi-head attention with learned Q/K/V projections and an output projection."""

    def __init__(self, hidden_size, attention_dropout_rate, num_heads):
        """Build the projections.

        Args:
            hidden_size: feature width of the inputs and of the output.
            attention_dropout_rate: dropout probability on the attention weights.
            num_heads: number of heads; each head gets hidden_size // num_heads features.
        """
        super().__init__()

        head_dim = hidden_size // num_heads  # hidden = head_dim * heads when divisible
        proj_dim = num_heads * head_dim

        self.num_heads = num_heads
        self.att_size = head_dim
        self.scale = head_dim ** -0.5  # the 1/sqrt(d_k) factor

        self.linear_q = nn.Linear(hidden_size, proj_dim)
        self.linear_k = nn.Linear(hidden_size, proj_dim)
        self.linear_v = nn.Linear(hidden_size, proj_dim)
        self.att_dropout = nn.Dropout(attention_dropout_rate)

        self.output_layer = nn.Linear(proj_dim, hidden_size)

    def _split_heads(self, projected, batch_size):
        """Reshape [batch, len, heads * d] into [batch, heads, len, d]."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.att_size)
        return per_head.transpose(1, 2)

    def forward(self, q, k, v, attn_bias=None):
        """Compute softmax((Q W_q)(K W_k)^T / sqrt(d_k) + attn_bias) (V W_v), then project.

        Args:
            q, k, v: tensors whose first dimension is the batch and whose last
                dimension is hidden_size.
            attn_bias: optional tensor added to the raw attention scores before
                the softmax; it must broadcast against [batch, heads, q_len, k_len].

        Returns:
            A tensor with the same size as ``q``.
        """
        expected_size = q.size()
        batch_size = q.size(0)

        queries = self._split_heads(self.linear_q(q), batch_size)  # [b, h, q_len, d_k]
        keys = self._split_heads(self.linear_k(k), batch_size)     # [b, h, k_len, d_k]
        values = self._split_heads(self.linear_v(v), batch_size)   # [b, h, v_len, d_v]

        # Scale the queries first, then take the dot product with K^T.
        scores = torch.matmul(queries * self.scale, keys.transpose(2, 3))  # [b, h, q_len, k_len]
        if attn_bias is not None:
            scores = scores + attn_bias

        weights = self.att_dropout(torch.softmax(scores, dim=3))
        context = weights.matmul(values)  # [b, h, q_len, d_v]

        # Put the heads back next to the feature axis and concatenate them.
        context = context.transpose(1, 2).contiguous()
        context = context.view(batch_size, -1, self.num_heads * self.att_size)

        out = self.output_layer(context)

        assert out.size() == expected_size
        return out

### Encoder Layer

In [6]:
# One pre-norm Transformer encoder layer: a self-attention sub-layer followed
# by a feed-forward sub-layer, each wrapped in a residual connection.

class EncoderLayer(nn.Module):
    """LayerNorm -> self-attention -> dropout -> residual, then LayerNorm -> FFN -> dropout -> residual."""

    def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, num_heads):
        """Create both sub-layers.

        Args:
            hidden_size: feature width of the layer's input and output.
            ffn_size: intermediate width handed to FeedForwardNetwork.
            dropout_rate: dropout applied to each sub-layer's output.
            attention_dropout_rate: dropout handed to MultiHeadAttention.
            num_heads: number of attention heads.
        """
        super().__init__()

        # attention sub-layer
        self.self_attention_norm = nn.LayerNorm(hidden_size)
        self.self_attention = MultiHeadAttention(
            hidden_size, attention_dropout_rate, num_heads)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        # feed-forward sub-layer
        self.ffn_norm = nn.LayerNorm(hidden_size)
        self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, attn_bias=None):
        """Apply the two residual sub-layers and return a tensor shaped like ``x``."""
        # Self-attention on the normalised input (q, k and v are all the same tensor).
        normed = self.self_attention_norm(x)
        attended = self.self_attention(normed, normed, normed, attn_bias)
        x = x + self.self_attention_dropout(attended)

        # Feed-forward on the normalised result; the only dropout in this
        # sub-layer is ffn_dropout, applied to the FFN output.
        x = x + self.ffn_dropout(self.ffn(self.ffn_norm(x)))
        return x

### Transformer model

In [7]:
class TransformerModel(nn.Module):
    """Transformer encoder over per-node hop sequences with an attention readout.

    The input is embedded, passed through ``n_layers`` EncoderLayer blocks and
    a final LayerNorm. Position 0 of the sequence is treated as the node
    itself and the remaining ``hops`` positions as its neighbourhood tokens;
    the latter are combined with learned attention weights, added to the node
    token and classified with a two-layer head.
    """

    def __init__(
        self,
        hops, # number of hops; the expected sequence length is hops + 1
        n_class, # number of output classes
        input_dim, # input feature dimension
        pe_dim, # positional embedding dimension (stored, not used by this class)
        n_layers=6, # number of encoder (MSA+FFN) layers
        num_heads=8, # heads in the multi-head attention
        hidden_dim=64, # node representation width, shared by all heads
        ffn_dim=64, # accepted for compatibility; overridden by 2 * hidden_dim
        dropout_rate=0.0, # dropout on sub-layer outputs
        attention_dropout_rate=0.1 # dropout on the attention weights
    ):
        super().__init__()

        self.seq_len = hops+1
        self.pe_dim = pe_dim
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # NOTE: the ffn_dim argument is ignored; the FFN width is always
        # twice the hidden width.
        self.ffn_dim = 2 * hidden_dim
        self.num_heads = num_heads

        self.n_layers = n_layers
        self.n_class = n_class

        self.dropout_rate = dropout_rate
        self.attention_dropout_rate = attention_dropout_rate

        self.att_embeddings_nope = nn.Linear(self.input_dim, self.hidden_dim) # initial embeddings

        encoders = [EncoderLayer(self.hidden_dim, self.ffn_dim, self.dropout_rate, self.attention_dropout_rate, self.num_heads)
                    for _ in range(self.n_layers)]
        self.layers = nn.ModuleList(encoders)
        self.final_ln = nn.LayerNorm(hidden_dim)

        # classification head: hidden_dim -> hidden_dim / 2 -> n_class
        self.out_proj = nn.Linear(self.hidden_dim, int(self.hidden_dim/2))

        # scores each neighbourhood token from [node token ; neighbourhood token]
        self.attn_layer = nn.Linear(2 * self.hidden_dim, 1)

        self.Linear1 = nn.Linear(int(self.hidden_dim/2), self.n_class)

        # Registered parameter that forward() does not use; kept so existing
        # state_dicts still load.
        self.scaling = nn.Parameter(torch.ones(1) * 0.5)

        self.apply(lambda module: init_params(module, n_layers=n_layers))

    def forward(self, batched_data):
        """Return per-node log-probabilities.

        Args:
            batched_data: tensor of shape [batch, hops + 1, input_dim]; the
                sequence axis must have exactly ``self.seq_len`` entries
                because it is split into sizes [1, seq_len - 1].

        Returns:
            Tensor of shape [batch, n_class] holding log-softmax scores.
        """
        # initial embeddings
        tensor = self.att_embeddings_nope(batched_data)

        # transformer encoder (n_layers number of MSA+FFN layers)
        for enc_layer in self.layers:
            tensor = enc_layer(tensor)

        output = self.final_ln(tensor)

        # Copy the node token once per neighbourhood token so the two can be
        # concatenated feature-wise.
        target = output[:, 0, :].unsqueeze(1).repeat(1, self.seq_len - 1, 1)
        node_tensor, neighbor_tensor = torch.split(output, [1, self.seq_len - 1], dim=1)
        # node_tensor: [batch, 1, hidden_dim]; neighbor_tensor: [batch, hops, hidden_dim]

        # one score per neighbourhood token, normalised across the hop axis
        layer_atten = self.attn_layer(torch.cat((target, neighbor_tensor), dim=2))
        layer_atten = F.softmax(layer_atten, dim=1)

        # attention-weighted sum of the neighbourhood tokens
        neighbor_tensor = torch.sum(neighbor_tensor * layer_atten, dim=1, keepdim=True)

        # Drop only the sequence axis. A bare squeeze() would also remove the
        # batch axis when batch == 1 and break log_softmax(dim=1) below.
        output = (node_tensor + neighbor_tensor).squeeze(1)

        output = self.Linear1(torch.relu(self.out_proj(output)))

        return torch.log_softmax(output, dim=1)

## Training

In [8]:
# Dataset loader from the project-local `data` module.
from data import get_dataset
# NOTE(review): `dgl` is not imported by name in any visible cell, so this
# line raises NameError in a fresh kernel unless `dgl` is already bound.
# Confirm, and add `import dgl` above if that is the case.
dgl.backend.set_preferred_backend('pytorch')

import time
import utils
import random
import argparse
import numpy as np
import torch
import torch.nn.functional as F
from early_stop import EarlyStopping, Stop_args
# NOTE(review): this rebinds `TransformerModel`, shadowing the class defined
# in the notebook's model cell. Confirm which definition is meant to be used.
from model import TransformerModel
from lr import PolynomialDecayLR
import os.path
import torch.utils.data as Data
# NOTE(review): `argparse` is already imported a few lines above.
import argparse

# Hyper-parameters gathered in one Namespace instead of being parsed from the
# command line; the values match the train.py invocation quoted at the top of
# the notebook where that command specifies them.
args = argparse.Namespace(
    # main parameters
    dataset='photo',
    device='1',
    seed=3407,
    # model parameters
    hops=3,
    pe_dim=10,
    hidden_dim=128,
    ffn_dim=64,
    n_layers=1,
    n_heads=8,
    dropout=0.1,
    attention_dropout=0.1,
    # training parameters
    batch_size=2000,
    epochs=2000,
    tot_updates=1000,
    warmup_updates=400,
    peak_lr=0.001,
    end_lr=0.0001,
    weight_decay=0.00001,
    patience=50,
)

# Everything in this notebook runs on the CPU; args.device is not consulted.
device = 'cpu'

# Seed every random number generator in use so runs are repeatable.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [26]:
# Load the dataset named by args.dataset via the project's get_dataset helper.
# It yields the adjacency, node features, labels and the train/val/test index sets.
adj, features, labels, idx_train, idx_val, idx_test = get_dataset(args.dataset, args.pe_dim)


# Build the per-node input sequences from the adjacency and the features for
# args.hops hops; per the original note the result is shaped (N, hops+1, d).
processed_features = utils.re_features(adj, features, args.hops)  # return (N, hops+1, d)

In [45]:
# Notebook display of the adjacency; the recorded output below shows a sparse COO tensor.
adj

tensor(indices=tensor([[   0,    0,    0,  ..., 7533, 7533, 7534],
                       [   1,    2,    3,  ..., 7532, 7534, 7533]]),
       values=tensor([0.3333, 0.3333, 0.3333,  ..., 0.0630, 0.1667, 0.1667]),
       size=(7650, 7650), nnz=238163, layout=torch.sparse_coo)

In [9]:
labels = labels.to(device)

# One TensorDataset per split, pairing the hop features with the labels.
batch_data_train, batch_data_val, batch_data_test = (
    Data.TensorDataset(processed_features[split_idx], labels[split_idx])
    for split_idx in (idx_train, idx_val, idx_test)
)

# All three loaders share the batch size and shuffle their split.
train_data_loader, val_data_loader, test_data_loader = (
    Data.DataLoader(split_data, batch_size=args.batch_size, shuffle=True)
    for split_data in (batch_data_train, batch_data_val, batch_data_test)
)


# Model configuration: the class count is the largest label + 1 and the input
# width is the feature dimension.
model = TransformerModel(
    hops=args.hops,
    n_class=labels.max().item() + 1,
    input_dim=features.shape[1],
    pe_dim=args.pe_dim,
    n_layers=args.n_layers,
    num_heads=args.n_heads,
    hidden_dim=args.hidden_dim,
    ffn_dim=args.ffn_dim,
    dropout_rate=args.dropout,
    attention_dropout_rate=args.attention_dropout,
)
model = model.to(device)

print(model)
print('total params:', sum(p.numel() for p in model.parameters()))

# AdamW starting at the peak learning rate; the project's PolynomialDecayLR
# scheduler is configured with the warm-up and total update counts from args.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=args.peak_lr,
    weight_decay=args.weight_decay,
)
lr_scheduler = PolynomialDecayLR(
    optimizer,
    warmup_updates=args.warmup_updates,
    tot_updates=args.tot_updates,
    lr=args.peak_lr,
    end_lr=args.end_lr,
    power=1.0,
)

Downloading C:\Users\surfnick\.dgl\amazon_co_buy_photo.zip from https://data.dgl.ai/dataset/amazon_co_buy_photo.zip...
Extracting file to C:\Users\surfnick\.dgl\amazon_co_buy_photo_b75d805d


  lap_pos_enc = torch.from_numpy(EigVec[:,1:pos_enc_dim+1]).float()


TransformerModel(
  (att_embeddings_nope): Linear(in_features=755, out_features=128, bias=True)
  (layers): ModuleList(
    (0): EncoderLayer(
      (self_attention_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttention(
        (linear_q): Linear(in_features=128, out_features=128, bias=True)
        (linear_k): Linear(in_features=128, out_features=128, bias=True)
        (linear_v): Linear(in_features=128, out_features=128, bias=True)
        (att_dropout): Dropout(p=0.1, inplace=False)
        (output_layer): Linear(in_features=128, out_features=128, bias=True)
      )
      (self_attention_dropout): Dropout(p=0.1, inplace=False)
      (ffn_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ffn): FeedForwardNetwork(
        (layer1): Linear(in_features=128, out_features=256, bias=True)
        (gelu): GELU(approximate='none')
        (layer2): Linear(in_features=256, out_features=128, bias=True)
      )
      (ffn_drop

In [10]:
def train_valid_epoch(epoch):
    """Run one training pass and one validation pass, then print a summary line.

    Uses the module-level ``model``, ``optimizer``, ``lr_scheduler``,
    ``train_data_loader``, ``val_data_loader`` and ``device``.

    Args:
        epoch: zero-based epoch index; printed as ``epoch + 1``.

    Returns:
        ``(loss_val, acc_val)``: the validation loss summed over batches and
        the summed result of ``utils.accuracy_batch`` over batches. Neither is
        normalised here; the printed accuracy divides by ``len(idx_val)``,
        so ``accuracy_batch`` presumably returns a count of correct
        predictions (confirm in ``utils``).
    """
    model.train()
    loss_train_b = 0
    acc_train_b = 0
    for nodes_features, labels in train_data_loader:
        nodes_features = nodes_features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = model(nodes_features)
        loss_train = F.nll_loss(output, labels)
        loss_train.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per batch, not per epoch

        loss_train_b += loss_train.item()
        acc_train = utils.accuracy_batch(output, labels)
        acc_train_b += acc_train.item()

    model.eval()
    loss_val = 0
    acc_val = 0
    # Validation needs no gradients: skipping autograd bookkeeping saves
    # memory and time without changing the computed values.
    with torch.no_grad():
        for nodes_features, labels in val_data_loader:
            nodes_features = nodes_features.to(device)
            labels = labels.to(device)

            output = model(nodes_features)
            loss_val += F.nll_loss(output, labels).item()
            acc_val += utils.accuracy_batch(output, labels).item()

    print('Epoch: {:04d}'.format(epoch+1),
        'loss_train: {:.4f}'.format(loss_train_b),
        'acc_train: {:.4f}'.format(acc_train_b/len(idx_train)),
        'loss_val: {:.4f}'.format(loss_val),
        'acc_val: {:.4f}'.format(acc_val/len(idx_val)))

    return loss_val, acc_val

In [11]:
def test():
    """Evaluate the module-level ``model`` on ``test_data_loader`` and print the result.

    Prints the NLL loss summed over batches and the summed result of
    ``utils.accuracy_batch`` divided by ``len(idx_test)``. Returns nothing.
    """
    # Switch to inference mode once, rather than on every batch.
    model.eval()

    loss_test = 0
    acc_test = 0
    # Evaluation needs no gradients: skipping autograd bookkeeping saves
    # memory and time without changing the computed values.
    with torch.no_grad():
        for nodes_features, labels in test_data_loader:
            nodes_features = nodes_features.to(device)
            labels = labels.to(device)

            output = model(nodes_features)
            loss_test += F.nll_loss(output, labels).item()
            acc_test += utils.accuracy_batch(output, labels).item()

    print("Test set results:",
        "loss= {:.4f}".format(loss_test),
        "accuracy= {:.4f}".format(acc_test/len(idx_test)))

In [12]:
# Wall-clock start, used for the "Train cost" report below.
t_total = time.time()
# Early-stopping setup from the project's early_stop module. Stop_args is
# unpacked with **, so it presumably returns a mapping of keyword arguments
# (confirm in early_stop).
stopping_args = Stop_args(patience=args.patience, max_epochs=args.epochs)
early_stopping = EarlyStopping(model, **stopping_args)
for epoch in range(args.epochs):
    loss_val, acc_val = train_valid_epoch(epoch)
    # check() receives [acc_val, loss_val] and the epoch index; a truthy
    # result ends training early.
    if early_stopping.check([acc_val, loss_val], epoch):
        break

print("Optimization Finished!")
print("Train cost: {:.4f}s".format(time.time() - t_total))
# Restore the weights recorded by the early-stopping helper as the best ones.
print('Loading {}th epoch'.format(early_stopping.best_epoch+1))
model.load_state_dict(early_stopping.best_state)

# Final evaluation on the test split with the restored weights.
test()

Epoch: 0001 loss_train: 6.2147 acc_train: 0.1562 loss_val: 2.0707 acc_val: 0.1745
Epoch: 0002 loss_train: 6.2074 acc_train: 0.1788 loss_val: 2.0661 acc_val: 0.1987
Epoch: 0003 loss_train: 6.1904 acc_train: 0.2124 loss_val: 2.0592 acc_val: 0.2307
Epoch: 0004 loss_train: 6.1689 acc_train: 0.2309 loss_val: 2.0506 acc_val: 0.2418
Epoch: 0005 loss_train: 6.1425 acc_train: 0.2481 loss_val: 2.0408 acc_val: 0.2503
Epoch: 0006 loss_train: 6.1067 acc_train: 0.2581 loss_val: 2.0300 acc_val: 0.2582
Epoch: 0007 loss_train: 6.0799 acc_train: 0.2583 loss_val: 2.0181 acc_val: 0.2542
Epoch: 0008 loss_train: 6.0371 acc_train: 0.2577 loss_val: 2.0052 acc_val: 0.2536
Epoch: 0009 loss_train: 5.9975 acc_train: 0.2551 loss_val: 1.9915 acc_val: 0.2529
Epoch: 0010 loss_train: 5.9573 acc_train: 0.2546 loss_val: 1.9776 acc_val: 0.2529
Epoch: 0011 loss_train: 5.9198 acc_train: 0.2544 loss_val: 1.9635 acc_val: 0.2536
Epoch: 0012 loss_train: 5.8695 acc_train: 0.2562 loss_val: 1.9477 acc_val: 0.2608
Epoch: 0013 loss

Epoch: 0101 loss_train: 0.0669 acc_train: 0.9956 loss_val: 0.2232 acc_val: 0.9510
Epoch: 0102 loss_train: 0.0536 acc_train: 0.9970 loss_val: 0.2192 acc_val: 0.9490
Epoch: 0103 loss_train: 0.0505 acc_train: 0.9972 loss_val: 0.2210 acc_val: 0.9523
Epoch: 0104 loss_train: 0.0472 acc_train: 0.9980 loss_val: 0.2266 acc_val: 0.9516
Epoch: 0105 loss_train: 0.0503 acc_train: 0.9976 loss_val: 0.2219 acc_val: 0.9523
Epoch: 0106 loss_train: 0.0566 acc_train: 0.9976 loss_val: 0.2219 acc_val: 0.9536
Epoch: 0107 loss_train: 0.0535 acc_train: 0.9980 loss_val: 0.2294 acc_val: 0.9523
Epoch: 0108 loss_train: 0.0430 acc_train: 0.9985 loss_val: 0.2233 acc_val: 0.9542
Epoch: 0109 loss_train: 0.0544 acc_train: 0.9985 loss_val: 0.2281 acc_val: 0.9549
Epoch: 0110 loss_train: 0.0341 acc_train: 0.9985 loss_val: 0.2383 acc_val: 0.9536
Epoch: 0111 loss_train: 0.0431 acc_train: 0.9987 loss_val: 0.2297 acc_val: 0.9549
Epoch: 0112 loss_train: 0.0363 acc_train: 0.9987 loss_val: 0.2320 acc_val: 0.9536
Epoch: 0113 loss