In [1]:
import os
import shutil
import sys
import yaml
import numpy as np
import pandas as pd
import random
from datetime import datetime

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error, root_mean_squared_error


from dataset.dataset_test import MolTestDatasetWrapper

from dataset.get_config import get_config 
import argparse
from torch_geometric.utils import  scatter, softmax

from torch_geometric.data import Data

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Optional NVIDIA apex support for mixed-precision training.
# `apex_support` records whether `apex.amp` could be imported from ./apex or
# the regular import path.
apex_support = False
try:
    sys.path.append('./apex')
    from apex import amp

    apex_support = True
except Exception:
    # This used to be a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit. `Exception` keeps the best-effort fallback for any
    # import-time failure while still letting the kernel be interrupted.
    print("Please install apex for mixed precision training from: https://github.com/NVIDIA/apex")
    apex_support = False


def _save_config_file(model_checkpoints_folder):
    if not os.path.exists(model_checkpoints_folder):
        os.makedirs(model_checkpoints_folder)
        shutil.copy('./config_finetune.yaml', os.path.join(model_checkpoints_folder, 'config_finetune.yaml'))

def get_roc_auc_score(y_true, y_pred, is_valid):
    """Mean ROC-AUC over the label columns that contain both classes.

    Labels are expected as +1 / -1; entries equal to 0 are excluded from
    scoring. A column is scored only if it holds at least one +1 and at least
    one -1, and its labels are mapped to 1 / 0 before ``roc_auc_score`` is
    called.

    Args:
        y_true: 2-D array of labels, one column per task.
        y_pred: 2-D array of scores, indexed the same way as ``y_true``.
        is_valid: Unused. Kept so existing call sites keep working; the
            validity mask is recomputed per column from ``y_true``.

    Returns:
        The average of the per-column AUCs, or ``float('nan')`` when no column
        could be scored (this case previously raised ``ZeroDivisionError``).
    """
    n_tasks = y_true.shape[1]
    roc_list = []
    for i in range(n_tasks):
        column = y_true[:, i]
        # AUC is only defined when both a positive and a negative label exist.
        if np.sum(column == 1) > 0 and np.sum(column == -1) > 0:
            # Local mask; deliberately not named `is_valid`, so the parameter
            # is no longer shadowed.
            labeled = column**2 > 0
            roc_list.append(roc_auc_score((y_true[labeled, i] + 1)/2, y_pred[labeled, i]))

    if len(roc_list) < n_tasks:
        print("Some target is missing!")
        print("Missing ratio: %f" %(1 - float(len(roc_list))/n_tasks))

    if not roc_list:
        # No scorable column: report "undefined" instead of dividing by zero.
        return float('nan')

    return sum(roc_list)/len(roc_list)




class Normalizer:
    """Standardise tensors with a fixed mean/std and undo it afterwards."""

    def __init__(self, tensor):
        """Derive ``mean`` and ``std`` from ``tensor``, used as a sample."""
        self.mean = tensor.mean()
        self.std = tensor.std()

    def norm(self, tensor):
        """Return ``tensor`` shifted by the stored mean and scaled by the stored std."""
        centered = tensor - self.mean
        return centered / self.std

    def denorm(self, normed_tensor):
        """Invert :meth:`norm`: scale by the stored std, then add the stored mean."""
        rescaled = normed_tensor * self.std
        return rescaled + self.mean

    def state_dict(self):
        """Return the statistics as a dict with keys ``'mean'`` and ``'std'``."""
        return dict(mean=self.mean, std=self.std)

    def load_state_dict(self, state_dict):
        """Overwrite the stored statistics from a dict produced by :meth:`state_dict`."""
        for key in ('mean', 'std'):
            setattr(self, key, state_dict[key])


Please install apex for mixed precision training from: https://github.com/NVIDIA/apex


In [36]:
class Args:
    """Run options for this notebook, exposed as plain class attributes."""

    # --- optimisation ---
    batch_size = 4
    epochs = 200
    init_lr = 5e-4
    init_base_lr = 1e-4
    weight_decay = 1e-6

    # --- hardware / reproducibility ---
    gpu = 'cuda:1'
    seed = '42'  # stored as a string; a later cell converts it with int()

    # --- model ---
    model_type = 'gin'
    num_layer = 5
    emb_dim = 300
    feat_dim = 300
    dropout = 0.3
    pool = 'mean'

    # --- dataset ---
    task_name = 'bace'
    splitting = 'scaffold'

    # --- masking / reconstruction options ---
    random_masking = 1
    mask_rate = 3
    mask_edge = 0
    alpha = 0.1
    reduceTrain = 1


args = Args()

In [37]:
# Load the fine-tuning configuration. The `with` block closes the file handle,
# which the previous one-liner `yaml.load(open(...))` left open.
# NOTE(review): FullLoader is kept for compatibility; if the file only holds
# plain scalars/lists/maps, yaml.safe_load would be the safer choice - confirm.
with open("config_finetune.yaml", "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [38]:
# Raw seed exactly as stored on Args (the string '42' in this notebook).
# It is written into config['dataset']['seed'] in the next cell and only
# converted to int in a later cell.
seed = args.seed

In [39]:
# Override values from the YAML file with the notebook-level options in `args`.
config.update({
    'batch_size': args.batch_size,
    'epochs': args.epochs,
    'init_lr': args.init_lr,
    'init_base_lr': args.init_base_lr,
    # 'weight_decay' is not overridden (that assignment was commented out),
    # so the value from the YAML file stays in effect.
    'gpu': args.gpu,
})

# Model hyper-parameters; note that args.dropout lands under 'drop_ratio'.
config['model'].update({
    'num_layer': args.num_layer,
    'emb_dim': args.emb_dim,
    'feat_dim': args.feat_dim,
    'drop_ratio': args.dropout,
    'pool': args.pool,
})

config['task_name'] = args.task_name

# `seed` is whatever the previous cell left in the namespace.
config['dataset'].update({
    'seed': seed,
    'splitting': args.splitting,
})

In [40]:
# Integer form of the seed, used for the RNG seeding calls below.
# This rebinds `seed`; config['dataset']['seed'] keeps the earlier raw value.
seed =  int(args.seed)

In [41]:
    # Seed every RNG in play so runs are repeatable. `random` is imported by
    # this notebook but was never seeded, so anything drawing from Python's
    # global RNG was not covered by the torch/numpy seeds.
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Normalise the task name; later cells compare it against lowercase names.
    config['task_name'] = config['task_name'].lower()

In [42]:
# Pass the config through the project's `get_config` helper (imported from
# dataset.get_config) and keep its return value. What the helper adds or
# changes is not visible in this notebook; later cells read
# config['dataset']['task'] from the result.
config = get_config(config)

### Moving on to `main`

In [43]:
# Build the dataset wrapper: batch size as the positional argument, every key
# of config['dataset'] expanded as keyword arguments, plus the masking options
# taken from `args`.
dataset = MolTestDatasetWrapper(config['batch_size'],
                                **config['dataset'],
                                random_masking=args.random_masking,
                                mask_rate=args.mask_rate,
                                mask_edge=args.mask_edge)

## Porting `step` and `test`

In [44]:
    def _get_device():
        """Pick the device named in ``config['gpu']`` and record a tag for it on ``args``.

        When CUDA is available and ``config['gpu']`` is not ``'cpu'``, that
        device is made the current CUDA device; otherwise ``'cpu'`` is used.

        Side effects:
            Sets ``args.deviceName`` to a separator-free tag (``'cuda1'`` for
            ``'cuda:1'``, or ``'cpu'``), which a later cell embeds in the
            log-directory name. Prints the chosen device.

        Returns:
            The device string: ``config['gpu']`` or ``'cpu'``.
        """
        if torch.cuda.is_available() and config['gpu'] != 'cpu':
            device = config['gpu']
            torch.cuda.set_device(device)
            # Take everything after the last ':' as the device index. The
            # previous `device[-1]` kept only the final character, so
            # 'cuda:10' was tagged 'cuda0'.
            args.deviceName = "cuda" + str(device).rpartition(':')[2]

        else:
            device = 'cpu'
            args.deviceName = 'cpu'

        print("Running on:", device)

        return device

In [45]:
# Resolve the device once; this call also sets args.deviceName, which the
# log-directory cell below reads.
device = _get_device()

Running on: cuda:1


In [46]:
# Prefix of the run/log directory name built a few cells below.
savefilename = 'finetune_recon_sub'

In [47]:
# Custom-scalars layout handed to the SummaryWriter: one "Multiline" chart per
# metric under the "recon" category, each listing the scalar tags
# "<metric>/<split>" it should plot.
layout = {
    "recon": {
        chart: ["Multiline", [f"{chart}/{split}" for split in splits]]
        for chart, splits in (
            ("loss_end", ("train", "validation")),
            ("loss_recon_node", ("train",)),
            ("loss_recon_edge", ("train",)),
            ("loss_total", ("train", "validation")),
            ("accuracy", ("validation",)),
        )
    },
}

In [48]:
# Run name: prefix + task + model/hyper-parameter tags + device + seed + timestamp.
# NOTE: `savefilename` and the task name are joined without a separator.
current_time = datetime.now().strftime('%b%d_%H-%M-%S')
dir_name =  savefilename + config['task_name'] + '_' + str(args.num_layer) + '_' \
+ str(args.emb_dim) + '_' + str(args.feat_dim)  + '_' + str(args.dropout) + '_' \
+ str(args.splitting) + '_' + str(args.deviceName) + '_' + str(args.seed) + '_' + str(current_time)

# TensorBoard writer under finetune/<dir_name>, with the custom chart layout.
log_dir = os.path.join('finetune', dir_name)
writer = SummaryWriter(log_dir=log_dir)
writer.add_custom_scalars(layout)

# Task loss, chosen from the dataset's task type.
# (A no-op `dataset = dataset` statement that used to sit here was removed.)
if config['dataset']['task'] == 'classification':
    # reduction="none" returns the loss per element instead of a reduced scalar.
    criterion =  nn.BCEWithLogitsLoss(reduction = "none")
elif config['dataset']['task'] == 'regression':
    if config["task_name"] in ['qm7', 'qm8', 'qm9']:
        criterion = nn.L1Loss()
    else:
        criterion = nn.MSELoss()
else:
    # Previously an unknown task left `criterion` undefined, or silently
    # reused a stale value from an earlier run of this cell.
    raise ValueError(
        "Unsupported task type %r; expected 'classification' or 'regression'"
        % (config['dataset']['task'],)
    )

# Loss for the reconstruction objective.
criterion_recon = nn.CrossEntropyLoss()

In [49]:
# Expose the masking rate to the model section of the config as well.
config['model']['mask_rate'] = args.mask_rate

In [50]:

import torch
from torch import nn
import torch.nn.functional as F

from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops
from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool

class GINEConv(MessagePassing):
    """Message-passing layer that adds bond embeddings to neighbour features.

    Each message is the source-node feature plus the embedding of the
    connecting edge. The aggregated messages (using ``MessagePassing``'s
    default aggregation, since none is passed) go through a two-layer MLP.

    NOTE: ``num_bond_type`` and ``num_bond_direction`` are not parameters.
    They are looked up as module-level names when the layer is constructed,
    so they must be defined before the first ``GINEConv(...)`` call.

    Args:
        emb_dim: Width of the node features, the edge embeddings and the MLP
            output (the hidden MLP layer is ``2 * emb_dim`` wide).
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, 2*emb_dim),
            nn.ReLU(),
            nn.Linear(2*emb_dim, emb_dim)
        )
        # One embedding table per edge-attribute column.
        self.edge_embedding1 = nn.Embedding(num_bond_type, emb_dim)
        self.edge_embedding2 = nn.Embedding(num_bond_direction, emb_dim)

        nn.init.xavier_uniform_(self.edge_embedding1.weight.data)
        nn.init.xavier_uniform_(self.edge_embedding2.weight.data)

    def forward(self, x, edge_index, edge_attr):
        """Add self-loops, embed the edge attributes and propagate.

        Args:
            x: Node features; ``x.size(0)`` is used as the number of nodes.
            edge_index: Edge list handed to ``add_self_loops`` and ``propagate``.
            edge_attr: Two-column integer edge attributes. Column 0 indexes
                ``edge_embedding1`` and column 1 indexes ``edge_embedding2``.

        Returns:
            The result of ``propagate`` (the MLP applied to the aggregated
            messages, see :meth:`update`).
        """
        num_nodes = x.size(0)

        # add self loops in the edge space
        edge_index = add_self_loops(edge_index, num_nodes=num_nodes)[0]

        # Features for the self-loop edges: one row per node, bond type 4 and
        # direction 0. The tensor is allocated directly with edge_attr's dtype
        # and device, so no CPU float tensor is created and then copied and
        # cast on every forward pass.
        self_loop_attr = torch.zeros(
            num_nodes, 2, dtype=edge_attr.dtype, device=edge_attr.device
        )
        self_loop_attr[:, 0] = 4  # bond type for self-loop edge
        edge_attr = torch.cat((edge_attr, self_loop_attr), dim=0)

        edge_embeddings = self.edge_embedding1(edge_attr[:, 0]) + \
            self.edge_embedding2(edge_attr[:, 1])

        return self.propagate(edge_index, x=x, edge_attr=edge_embeddings)

    def message(self, x_j, edge_attr):
        """Message for one edge: source-node feature plus edge embedding."""
        return x_j + edge_attr

    def update(self, aggr_out):
        """Transform the aggregated messages with the layer's MLP."""
        return self.mlp(aggr_out)

In [52]:
# Ask the dataset wrapper for its three loaders (train / validation / test).
train_loader, valid_loader, test_loader = dataset.get_data_loaders()

1512
About to generate scaffolds
Generating scaffold 0/1512
Generating scaffold 1000/1512
About to sort in scaffold sets


In [53]:
# Display the loader object's repr as a quick sanity check.
train_loader

<torch_geometric.deprecation.DataLoader at 0x7effb5dc9ea0>

In [32]:
# Materialise one full pass over the training loader into a list of batches.
all_batches = list(train_loader)

In [92]:
# Pull a single batch from the training loader to step through the forward
# pass by hand in the cells below.
train_loader_iterator = iter(train_loader)
data = next(train_loader_iterator)  # Fetch the first batch.

In [93]:
# Sizes of the embedding tables for the categorical atom features.
num_atom_type = 119 # including the extra mask tokens
num_chirality_tag = 3

# Sizes of the edge embedding tables. GINEConv reads these two names as
# globals when it is constructed, so they must exist before `gnns` is built.
num_bond_type = 5 # including aromatic and self-loop edge
num_bond_direction = 3 

# Individual layers created at notebook level so the forward pass can be
# stepped through cell by cell. They are plain Python lists/objects here,
# not registered inside an nn.Module.
emb_dim = 300
x_embedding1 = nn.Embedding(num_atom_type, emb_dim)
x_embedding2 = nn.Embedding(num_chirality_tag, emb_dim)
x_embedding3 = nn.Linear(40, emb_dim)
gnns = [GINEConv(emb_dim) for _ in range(5)]
batch_norms = [nn.BatchNorm1d(emb_dim) for _ in range(5)]
pool = global_mean_pool
feat_lin = nn.Linear(emb_dim + 200, 512)
pred_head = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Linear(256, 1))
# Maps each node embedding to a single scalar score.
dense_score = nn.Linear(emb_dim, 1)

# Bundle the three input embedding layers into one tuple.
# The cells below use the batch object `data` fetched earlier.
embeddings = (x_embedding1, x_embedding2, x_embedding3)

In [105]:
# Unpack the fields of the batch that the forward pass below uses.
x = data.x
edge_index = data.edge_index
edge_attr = data.edge_attr


In [106]:
 # Initial node representation: sum of the embeddings looked up from column 0
 # (x_embedding1) and column 1 (x_embedding2) of the node feature matrix.
 h = x_embedding1(x[:,0].long()) + x_embedding2(x[:,1].long()) 

In [107]:
# Show the initial node embeddings.
h

tensor([[-2.6367,  1.3134,  0.4415,  ..., -1.7389, -0.8103, -0.0591],
        [-1.7949,  2.2441,  0.5851,  ..., -0.3990,  0.8168, -1.0163],
        [-2.6367,  1.3134,  0.4415,  ..., -1.7389, -0.8103, -0.0591],
        ...,
        [-2.0584,  0.6337, -0.7903,  ..., -0.5689, -3.1688, -1.2781],
        [-2.0584,  0.6337, -0.7903,  ..., -0.5689, -3.1688, -1.2781],
        [-2.0584,  0.6337, -0.7903,  ..., -0.5689, -3.1688, -1.2781]],
       grad_fn=<AddBackward0>)

# Number of GNN layers to run; `gnns` and `batch_norms` were built with 5
# entries each, so this must not exceed 5.
num_layer = 5

In [108]:
# Dropout probability passed to F.dropout in the layer loop below.
drop_ratio = 0.3

In [109]:
# Run the GNN stack: conv -> batch-norm -> (ReLU on every layer but the last)
# -> dropout.
# NOTE: `h` is overwritten on every iteration, so re-running this cell feeds
# the previous output back in instead of the initial embeddings.
for layer in range(num_layer):
    h = batch_norms[layer](gnns[layer](h, edge_index, edge_attr))
    h = F.dropout(h if layer == num_layer - 1 else F.relu(h), drop_ratio)

# Keep the per-node representation under its own name before pooling.
h_node = h

In [111]:
# Pool node embeddings into one vector per graph, grouped by `data.batch`
# (`pool` is global_mean_pool here). This rebinds `h`; the per-node values
# remain available as `h_node`.
h = pool(h_node, data.batch)

In [112]:
# One scalar score per node. dense_score ends in a size-1 feature dimension;
# drop only that dimension. A bare `.squeeze()` would also collapse the node
# dimension when the batch holds a single node, yielding a 0-d tensor.
score = dense_score(h_node).squeeze(-1)

In [118]:
# Notebook-level copy of the masking rate (same value as Args.mask_rate);
# passed as the second argument to `topk` below.
mask_rate = 3

In [119]:
# NOTE(review): this import is project-local and sits mid-notebook; it would
# normally live in the first (imports) cell.
from util import topk

# Select nodes by score within each graph of the batch via the project's
# `topk` helper; its exact selection semantics are not visible here - confirm
# against util.topk. `score` was already squeezed in the cell that created it,
# so the extra `.squeeze()` is redundant for the usual multi-node batch.
node = topk(score.squeeze(), mask_rate, batch = data.batch)

In [89]:
# Hop limit for the step being prototyped below; not yet used by any cell in
# this notebook.
max_hop = 1

In [101]:
# Show the batch vector: for every node, the index of the graph it belongs to.
data.batch 

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [121]:
# Count the nodes of each graph: sum a vector of ones (one per node, same
# dtype/device as data.batch) grouped by the graph index in data.batch.
num_nodes = scatter(data.batch.new_ones(x.size(0)), data.batch, reduce='sum')

In [123]:
# NOTE: despite the name, no maximum is taken here; `max_nodes` is simply
# another reference to the per-graph node counts tensor.
max_nodes = num_nodes

In [None]:
start_node = 