## Notebook to evaluate molecule graph regression models (individual, fused or ensemble)

In [3]:
"""
    IMPORTING LIBS
"""
import dgl

import numpy as np
import os
import socket
import time
import random
import glob
import argparse, json
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torch.utils.data import DataLoader

from tensorboardX import SummaryWriter
from tqdm import tqdm

In [11]:
import train.train_molecules_graph_regression as train

In [4]:
# """
#     AUTORELOAD IPYTHON EXTENSION FOR RELOADING IMPORTED MODULES
# """

def in_ipynb():
    """Tell whether this code is executing inside an IPython/Jupyter session.

    Returns:
        bool: True when ``get_ipython`` can be looked up and its ``config``
        attribute read; False when the lookup raises ``NameError``
        (e.g. in a plain Python interpreter).
    """
    try:
        # IPython injects `get_ipython`; elsewhere the name lookup raises NameError.
        get_ipython().config
    except NameError:
        return False
    return True
    
notebook_mode = in_ipynb()
print(notebook_mode)

if notebook_mode == True:
    %load_ext autoreload
    %autoreload 2

True


The parameters below are a hard-coded copy kept for testing purposes. TODO: read them from the txt file in the training output instead.

In [1]:
import os

# Move from the notebook's folder to the project root exactly once per kernel.
# A bare os.chdir('../') climbs one level further on every re-run of this
# cell; recording the root the first time makes the cell idempotent.
if 'PROJECT_ROOT' not in globals():
    os.chdir('../')  # go to root folder of the project
    PROJECT_ROOT = os.getcwd()
print(os.getcwd())

/home/ehoskovec/Workspace/ETH/master/sem3/deepLearning/Project/gnn_fusion/gnn_benchmarking


In [2]:
"""
    IMPORTING CUSTOM MODULES/METHODS
"""
from nets.molecules_graph_regression.load_net import gnn_model # import all GNNS
from data.data import LoadData # import dataset

Using backend: pytorch


In [5]:
# """
#     USER CONTROLS
# """
if notebook_mode:
    # Architecture to work with. Names this notebook defines hyper-parameters
    # for: '3WLGNN', 'RingGNN', 'GatedGCN', 'MoNet', 'GCN', 'GAT', 'GraphSage',
    # 'DiffPool', 'MLP', 'GIN'.
    MODEL_NAME = 'GCN'

    DATASET_NAME = 'ZINC'

    # Take the timestamp once so the log and checkpoint directories always
    # share the same suffix; two separate strftime calls can straddle a
    # second boundary and produce different names.
    run_tag = MODEL_NAME + "_" + DATASET_NAME + "_" + time.strftime('%Hh%Mm%Ss_on_%b_%d_%Y')
    out_dir = 'out/molecules_graph_regression/'
    root_log_dir = out_dir + 'logs/' + run_tag
    root_ckpt_dir = out_dir + 'checkpoints/' + run_tag

    print("[I] Loading data (notebook) ...")
    dataset = LoadData(DATASET_NAME)
    # Keep the three splits under short names for the cells that follow.
    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    print("[I] Finished loading.")

[I] Loading data (notebook) ...
[I] Loading dataset ZINC...
train, test, val sizes : 10000 1000 1000
[I] Finished loading.
[I] Data load time: 14.1761s
[I] Finished loading.


In [6]:
# Architecture to instantiate; the checkpoint loaded further down is a GCN one.
MODEL_NAME = 'GCN'

# CPU execution: no GPU is requested and no explicit torch device is set.
use_gpu = False
gpu_id = -1
device = None

In [7]:
# """
#     PARAMETERS
# """
if notebook_mode:

    # Placeholders shared by every architecture. -1 presumably stands for
    # "not applicable"; the branch of the model that needs a value sets it.
    n_heads = pseudo_dim_MoNet = kernel = -1
    gnn_per_block = embedding_dim = pool_ratio = n_mlp_GIN = -1
    edge_feat = gated = False
    self_loop = False   # alternative: True
    max_time = 1
    pos_enc = True      # alternative: False
    pos_enc_dim = 8

    # Per-architecture hyper-parameters. MODEL_NAME holds a single string, so
    # at most one branch runs; an unlisted name leaves these variables unset.
    if MODEL_NAME == 'GatedGCN':
        seed, epochs, batch_size = 41, 1000, 5
        init_lr, min_lr, weight_decay = 5e-5, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        L, hidden_dim, dropout, readout = 4, 70, 0.0, 'mean'
        out_dim = hidden_dim

    elif MODEL_NAME == 'GCN':
        seed, epochs, batch_size = 68, 5, 5
        init_lr, min_lr, weight_decay = 5e-5, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.3, 25
        L, hidden_dim, dropout, readout = 4, 145, 0.0, 'mean'
        out_dim = hidden_dim

    elif MODEL_NAME == 'GAT':
        seed, epochs, batch_size = 41, 1000, 50
        init_lr, min_lr, weight_decay = 5e-5, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        L, n_heads, hidden_dim, dropout, readout = 4, 8, 18, 0.0, 'mean'
        # The output width is the per-head width times the number of heads.
        out_dim = n_heads * hidden_dim
        print('True hidden dim:', out_dim)

    elif MODEL_NAME == 'GraphSage':
        seed, epochs, batch_size = 41, 1000, 50
        init_lr, min_lr, weight_decay = 5e-5, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        L, hidden_dim, dropout, readout = 4, 108, 0.0, 'mean'
        out_dim = hidden_dim

    elif MODEL_NAME == 'MLP':
        seed, epochs, batch_size = 41, 1000, 50
        init_lr, min_lr, weight_decay = 5e-4, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        # MEAN variant
        gated = False
        L, hidden_dim, dropout, readout = 4, 150, 0.0, 'mean'
        out_dim = hidden_dim
        # GATED variant: assigned last, so these are the values in effect.
        gated = True
        L, hidden_dim, dropout, readout = 4, 135, 0.0, 'mean'
        out_dim = hidden_dim

    elif MODEL_NAME == 'DiffPool':
        seed, epochs, batch_size = 41, 1000, 50
        init_lr, min_lr, weight_decay = 5e-4, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        L, hidden_dim, dropout, readout = 4, 56, 0.0, 'mean'
        out_dim = hidden_dim
        n_heads, gnn_per_block, embedding_dim = 8, 3, hidden_dim
        # Replaces the batch size of 50 set a few lines above.
        batch_size, pool_ratio = 128, 0.15

    elif MODEL_NAME == 'GIN':
        seed, epochs, batch_size = 41, 1000, 50
        init_lr, min_lr, weight_decay = 5e-4, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        L, hidden_dim, dropout, readout = 4, 110, 0.0, 'mean'
        out_dim = hidden_dim
        n_mlp_GIN, learn_eps_GIN, neighbor_aggr_GIN = 2, True, 'sum'

    elif MODEL_NAME == 'MoNet':
        seed, epochs, batch_size = 41, 1000, 50
        init_lr, min_lr, weight_decay = 5e-4, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        L, hidden_dim, dropout, readout = 4, 90, 0.0, 'mean'
        out_dim = hidden_dim
        pseudo_dim_MoNet, kernel = 2, 3

    elif MODEL_NAME == 'RingGNN':
        seed, epochs, batch_size = 41, 1000, 1
        init_lr, min_lr, weight_decay = 5e-5, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        # No `readout` is assigned in this branch.
        L, hidden_dim, dropout = 4, 22, 0.0
        out_dim = hidden_dim
        edge_feat = False

    elif MODEL_NAME == '3WLGNN':
        seed, epochs, batch_size = 41, 1000, 1
        init_lr, min_lr, weight_decay = 5e-5, 1e-6, 0
        lr_reduce_factor, lr_schedule_patience = 0.5, 25
        # No `readout` is assigned in this branch.
        L, hidden_dim, dropout = 3, 79, 0.0
        out_dim = hidden_dim
        edge_feat = False

    # Node count of every training graph; feeds both the RingGNN average and
    # the DiffPool assignment dimension.
    num_nodes = [trainset[idx][0].number_of_nodes() for idx in range(len(trainset))]
    max_num_node = max(num_nodes)

    # Everything the network constructor receives, in one dictionary.
    net_params = {
        # generic
        'device': device,
        'num_atom_type': dataset.num_atom_type,
        'num_bond_type': dataset.num_bond_type,
        'residual': True,
        'hidden_dim': hidden_dim,
        'out_dim': out_dim,
        'n_heads': n_heads,
        'L': L,  # min L should be 2
        # NOTE(review): readout and dropout are fixed here; the `readout` /
        # `dropout` variables set in the model branches above are not read.
        'readout': "sum",
        'layer_norm': True,
        'batch_norm': True,
        'in_feat_dropout': 0.0,
        'dropout': 0.0,
        'edge_feat': edge_feat,
        'self_loop': self_loop,

        # for MLPNet
        'gated': gated,

        # specific for MoNet
        'pseudo_dim_MoNet': pseudo_dim_MoNet,
        'kernel': kernel,

        # specific for GIN (the last two are fixed here, not taken from the
        # same-named variables of the GIN branch)
        'n_mlp_GIN': n_mlp_GIN,
        'learn_eps_GIN': True,
        'neighbor_aggr_GIN': 'sum',

        # specific for graphsage
        'sage_aggregator': 'meanpool',

        # specific for diffpoolnet
        'data_mode': 'default',
        'gnn_per_block': gnn_per_block,
        'embedding_dim': embedding_dim,
        'pool_ratio': pool_ratio,
        'linkpred': True,
        'num_pool': 1,
        'cat': False,
        'batch_size': batch_size,

        # specific for RingGNN
        'radius': 2,
        'avg_node_num': int(np.ceil(np.mean(num_nodes))),

        # specific for 3WLGNN
        'depth_of_mlp': 2,

        # assignment dimension: pool_ratio * node count of the largest
        # training graph, times the batch size
        'assign_dim': int(max_num_node * pool_ratio) * batch_size,

        # positional encoding
        'pos_enc': pos_enc,
        'pos_enc_dim': pos_enc_dim,
    }
    

In [8]:
"""
    IMPORTING MODELS
"""
#file_name = "gnn_benchmarking/out/molecules_graph_regression/checkpoints/GCN_ZINC_GPU-1_15h26m48s_on_Nov_22_2023"

# Checkpoint whose weights are evaluated in this notebook.
CKPT_PATH = "out/molecules_graph_regression/checkpoints/GCN_ZINC_GPU-1_15h26m48s_on_Nov_22_2023/RUN_/final.pkl"

# Build the architecture first, then fill it with the stored weights.
model = gnn_model(MODEL_NAME, net_params)
# map_location='cpu' keeps the load working even if the checkpoint stores CUDA
# tensors; this notebook runs on CPU (device is None).
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load(CKPT_PATH, map_location='cpu'))
# Switch to inference mode; as the cell's last expression this also displays
# the module summary.
model.eval()

GCNNet(
  (in_feat_dropout): Dropout(p=0.0, inplace=False)
  (embedding_h): Embedding(28, 145)
  (layers): ModuleList(
    (0): GCNLayer(in_channels=145, out_channels=145, residual=True)
    (1): GCNLayer(in_channels=145, out_channels=145, residual=True)
    (2): GCNLayer(in_channels=145, out_channels=145, residual=True)
    (3): GCNLayer(in_channels=145, out_channels=145, residual=True)
  )
  (MLP_layer): MLPReadout(
    (FC_layers): ModuleList(
      (0): Linear(in_features=145, out_features=72, bias=True)
      (1): Linear(in_features=72, out_features=36, bias=True)
      (2): Linear(in_features=36, out_features=1, bias=True)
    )
  )
)

In [16]:
# Score the loaded model on the test split, in a fixed order and keeping every sample.
test_loader = DataLoader(testset, batch_size=5, shuffle=False, drop_last=False, collate_fn=dataset.collate)
# NOTE(review): the meaning of the trailing 0 is not visible from this notebook;
# presumably an epoch index. Confirm against evaluate_network_sparse.
_, test_mae = train.evaluate_network_sparse(model, device, test_loader, 0)
# The loader is built from `testset`, so this is the test MAE (it was
# previously mislabelled "Train MAE").
print("Test MAE: {:.4f}".format(test_mae))

Train MAE: 0.6608
