In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import mlflow
from mlflow.models import infer_signature

import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import global_mean_pool
from torch_geometric.loader import DataLoader
from splitter import random_split, scaffold_split


from datasets.molnet import MoleculeDataset
from model.gnn import GNN
from model.mlp import MLP

def seed_all(seed):
    if not seed:
        seed = 0
    print("[ Using Seed : ", seed, " ]")
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    return

def get_num_task(dataset):
    # Get output dimensions of different tasks
    if dataset == 'mdck':
        return 1
    
def compute_mean_mad(values):
    meann = torch.mean(values)
    mad = torch.std(values)
    return meann, mad

def train_general(model, device, loader, optimizer):
    model.train()
    output_layer.train()
    total_loss = 0

    for step, batch in enumerate(loader):
        batch = batch.to(device)
        h = global_mean_pool(model(batch), batch.batch)
        pred = output_layer(h)
        
        y = batch.y.view(pred.shape).float()
        y = ((y-meann)/mad)
        loss = reg_criterion(pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.detach().item()

    return total_loss / len(loader)


def eval_general(model, device, loader):
    model.eval()
    output_layer.eval()
    y_true, y_pred = [], []

    for step, batch in enumerate(loader):
        batch = batch.to(device)
        with torch.no_grad():
            h = global_mean_pool(model(batch), batch.batch)
            pred = output_layer(h)
    
        true = batch.y.view(pred.shape).float()
        y_true.append(true)
        y_pred.append(pred)


    y_true = torch.cat(y_true, dim=0).cpu().numpy()
    y_pred = (torch.cat(y_pred, dim=0)*mad + meann).cpu().numpy()

    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    pearson_r = np.corrcoef(y_true.T, y_pred.T)[1,0]
    return {'RMSE': rmse, 'MAE': mae, 'pearson_R': pearson_r}, y_true, y_pred    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

# Suppress specific UserWarning from torch_geometric
warnings.filterwarnings(
    action='ignore', 
    category=UserWarning, 
    message="It is not recommended to directly access the internal storage format `data`"
)

In [3]:
# Set a MLflow Experiment
# mlflow.end_run()
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("GIN MDCK regression")

# MLflow tag
mlflow_tag = "GIN adapted from Molscaling code, no annealing, with pretraining, with output_layer train/eval statements"

# Set hyperparameters
dataset_name = 'mdck'
num_tasks = get_num_task(dataset_name)
split = 'random'
batch_size = 256
num_layer = 5
emb_dim = 300
dropout_ratio = 0.5
lr = 1e-3
epochs = 1000
pubchem_pretrain = False
params = {
    'dataset_name': dataset_name,
    'split': split,
    'batch_size': batch_size,
    'num_layer': num_layer,
    'emb_dim': emb_dim,
    'dropout_ratio': dropout_ratio,
    'lr': lr,
    'epochs': epochs,
    'pubchem_pretrain': pubchem_pretrain
}

# Set your dataset directory
dataset_folder = '/home/ubuntu/adme/MolScaling/datasets/molecule_net/'
dataset = MoleculeDataset(dataset_folder + dataset_name, dataset=dataset_name)

# Set device and seed
device = torch.device('cuda') 
seed = 0
seed_all(seed)
torch.cuda.manual_seed_all(seed)


# Initalize model
model_param_group = []
model = GNN(num_layer=num_layer, emb_dim=emb_dim, drop_ratio=dropout_ratio).to(device)
output_layer = MLP(in_channels=emb_dim, hidden_channels=emb_dim, 
                    out_channels=num_tasks, num_layers=1, dropout=0).to(device)

if pubchem_pretrain:
    output_model_dir = '/home/ubuntu/adme/MolScaling/model_saved/'
    model_root = 'PubChem_Pretrained.pth'
    model.load_state_dict(torch.load(output_model_dir + model_root, map_location='cuda:0'))
    print('======= Model Loaded =======')
    
model_param_group.append({'params': output_layer.parameters(),'lr': lr})
model_param_group.append({'params': model.parameters(), 'lr': lr})
print(model)                

# Initalize optimizer and metrics
optimizer = optim.Adam(model_param_group, lr=lr, weight_decay=0)
reg_criterion = torch.nn.MSELoss()
train_result_list, val_result_list, test_result_list = [], [], []
metric_list = ['RMSE', 'MAE', 'pearson_R']
best_val_mae, best_val_idx = 1e10, 0

# Split data
if split == 'scaffold':
    smiles_list = pd.read_csv(dataset_folder + dataset_name + '/processed/smiles.csv',
                                header=None)[0].tolist()
    train_dataset, valid_dataset, test_dataset, (train_smiles, valid_smiles, test_smiles), (_,_,_) = scaffold_split(
        dataset, smiles_list, null_value=0, frac_train=0.8,frac_valid=0.2, frac_test=0, return_smiles=True)
        # dataset, smiles_list, null_value=0, frac_train=0.8,frac_valid=0.1, frac_test=0.1, return_smiles=True)
    print('split via scaffold')
elif split == 'random':
    smiles_list = pd.read_csv(dataset_folder + dataset_name + '/processed/smiles.csv',
                                header=None)[0].tolist()
    train_dataset, valid_dataset, test_dataset, (train_smiles, valid_smiles, test_smiles),_ = random_split(
        dataset, null_value=0, frac_train=0.8, frac_valid=0.2, frac_test=0, seed=seed, smiles_list=smiles_list)
        # dataset, null_value=0, frac_train=0.8, frac_valid=0.1, frac_test=0.1, seed=seed, smiles_list=smiles_list)
    print('randomly split')

# Set dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                        shuffle=True, num_workers=8)
val_loader = DataLoader(valid_dataset, batch_size=batch_size,
                        shuffle=False, num_workers=8)
# test_loader = DataLoader(test_dataset, batch_size=batch_size,
#                             shuffle=False, num_workers=8)

meann, mad = compute_mean_mad(train_dataset.data.y)
train_func = train_general
eval_func = eval_general

with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.set_tag("Description", mlflow_tag)
    
    for epoch in range(1, epochs + 1):
        loss = train_func(model, device, train_loader, optimizer)
        print('Epoch: {}\nLoss: {}'.format(epoch, loss))

        train_result, train_target, train_pred = eval_func(model, device, train_loader)
        train_result_list.append(train_result)
        
        # test_result, test_target, test_pred = eval_func(model, device, test_loader)
        # test_result_list.append(test_result)
        
        val_result, val_target, val_pred = eval_func(model, device, val_loader)
        val_result_list.append(val_result)

        for metric in metric_list:
            print('{} train: {:.6f}\tval: {:.6f}'.format(metric, train_result[metric], val_result[metric]))
            # print('{} train: {:.6f}\tval: {:.6f}\ttest: {:.6f}'.format(metric, train_result[metric], val_result[metric], test_result[metric]))

        if val_result['MAE'] < best_val_mae:
            best_val_mae = val_result['MAE']
            best_val_idx = epoch - 1
            

        train_result = {'train_' + key: value for key, value in train_result.items()}
        mlflow.log_metrics(train_result, step=epoch)
        mlflow.log_metrics(val_result, step=epoch)
        mlflow.log_metric('train_loss', loss, step=epoch)
        
    # signature = infer_signature(train_dataset, train_pred)
    model_info = mlflow.pytorch.log_model(model, "model")
    model_info = mlflow.pytorch.log_model(output_layer, "output_layer")

for metric in metric_list:
    print('Best (RMSE), {} train: {:.6f}\tval: {:.6f}'.format(
        metric, train_result_list[best_val_idx][metric], val_result_list[best_val_idx][metric]))
    # print('Best (RMSE), {} train: {:.6f}\tval: {:.6f}\ttest: {:.6f}'.format(
        # metric, train_result_list[best_val_idx][metric], val_result_list[best_val_idx][metric], test_result_list[best_val_idx][metric]))

Dataset: mdck
Data: Data(x=[61735, 2], edge_index=[2, 134804], edge_attr=[134804, 2], id=[2642], fingerprint=[2642, 1024], y=[2642])
[ Using Seed :  0  ]
GNN(
  (x_embedding1): Embedding(120, 300)
  (x_embedding2): Embedding(3, 300)
  (gnns): ModuleList(
    (0-4): 5 x GINConv()
  )
  (batch_norms): ModuleList(
    (0-4): 5 x BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)
randomly split
Epoch: 1
Loss: 1.1837658948368497
RMSE train: 0.684476	val: 0.662529
MAE train: 0.591488	val: 0.569687
pearson_R train: 0.284430	val: 0.295137
Epoch: 2
Loss: 0.9499290320608351
RMSE train: 0.773332	val: 0.765868
MAE train: 0.691688	val: 0.680744
pearson_R train: -0.003733	val: -0.002807
Epoch: 3
Loss: 0.8925008575121561
RMSE train: 1.037986	val: 1.050575
MAE train: 0.914821	val: 0.932489
pearson_R train: 0.279908	val: 0.292110
Epoch: 4
Loss: 0.8395944436391195
RMSE train: 1.243719	val: 1.266300
MAE train: 1.096487	val: 1.131010
pearson_R train: 0.265974	val: 0.275



RMSE train: 0.110679	val: 0.594863
MAE train: 0.085536	val: 0.437166
pearson_R train: 0.988805	val: 0.579770




Best (RMSE), RMSE train: 0.326085	val: 0.551819
Best (RMSE), MAE train: 0.247578	val: 0.404077
Best (RMSE), pearson_R train: 0.896541	val: 0.598786


