In [1]:
"""
This script load a given model and generates 
model analysis. First, it prints basic stats of 
how the model over or under predicts.
The it outputs the actual vs predicted yields
alsong with the given smiles for each sample
It also generates the GNN activations for each
input and plots the molecule with highly activated 
areas.
"""

'\nThis script load a given model and generates \nmodel analysis. First, it prints basic stats of \nhow the model over or under predicts.\nThe it outputs the actual vs predicted yields\nalsong with the given smiles for each sample\nIt also generates the GNN activations for each\ninput and plots the molecule with highly activated \nareas.\n'

In [1]:
# Switch to the project root so the relative paths used later ('./data/', 'output/') resolve.
# NOTE(review): this is a hard-coded absolute path on one user's AFS space; adjust it before running elsewhere.
%cd /afs/crc.nd.edu/user/m/msaebi/Public/chemistry/yield_rxn

/afs/crc.nd.edu/user/m/msaebi/Public/chemistry/yield_rxn


In [2]:
import os
import sys
import json
import warnings
import argparse
import logging
import pickle
import pandas as pd
import numpy as np
from collections import defaultdict

from IPython.display import SVG

import rdkit
import rdkit.Chem as Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import DrawingOptions
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import DrawingOptions


import torch
import torch.optim as opt
from torch.utils.data import DataLoader
from torch.utils.data import Dataset,Subset

from rxntorch.containers.reaction import Rxn
from rxntorch.containers.dataset import RxnGraphDataset as RxnGD
from rxntorch.utils import collate_fn
from rxntorch.models.yield_network import YieldNet, YieldTrainer
from rxntorch.models import yield_network

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score,r2_score

import scripts.load_utils as lu

In [3]:
def _str2bool(value):
    """Convert a command-line string such as "true"/"false" into a real bool.

    ``type=bool`` in argparse applies ``bool()`` to the raw string, so every
    non-empty value (including "false") ends up as True. This converter parses
    the text instead. An empty string maps to False, matching ``bool("")``.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean (true/false), got {!r}".format(value))


# Command-line style configuration for the analysis run. The notebook feeds an
# explicit argument list to parse_args() instead of reading sys.argv.
parser = argparse.ArgumentParser()

# --- dataset / model selection ---
parser.add_argument("-p", "--dataset_path", type=str, default='./data/', help="train dataset")
parser.add_argument("-dn", "--dataset_name", required=True, type=str, help="dataset name. Options: az (AstraZeneca),dy (Doyle),su (Suzuki)")
parser.add_argument("-op", "--output_path", type=str, default='./output/', help="saved model path")
parser.add_argument("-o", "--output_name", required=False, type=str, help="e.g. rxntorch.model")
parser.add_argument("-sn", "--split_set_num", type=int, default=1, help="Choose one split set for train and test. Options: 1-10")
parser.add_argument("-mv", "--model_version", type=str, default='5.3.1-learn-w', help="Choose the model version")

# --- model / training hyper-parameters ---
# NOTE(review): the help text of --dropout_rate talks about a validation split, not dropout; confirm the intended meaning.
parser.add_argument("-dr", "--dropout_rate", type=float, default=0.04, help="Ratio of samples to reserve for valid data")
parser.add_argument("-b", "--batch_size", type=int, default=40, help="number of batch_size")
parser.add_argument("-tb", "--test_batch_size", type=int, default=1, help="batch size for evaluation")
parser.add_argument("-e", "--epochs", type=int, default=200, help="number of epochs")
parser.add_argument("-hs", "--hidden", type=int, default=200, help="hidden size of model layers")
parser.add_argument("-l", "--layers", type=int, default=2, help="number of layers")

# --- learning-rate schedule ---
parser.add_argument("--lr", type=float, default=1e-2, help="learning rate of the optimizer")
parser.add_argument("-lrd", "--lr_decay", type=float, default=0.5, help="Decay factor for reducing the learning rate")
parser.add_argument("-lrs", "--lr_steps", type=int, default=10000,help="Number of steps between learning rate decay")

# --- Adam optimizer ---
parser.add_argument("-awd","--adam_weight_decay", type=float, default=0.0, help="weight_decay of adam")
parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")

parser.add_argument("-gc", "--grad_clip", type=float, default=None, help="value for gradient clipping")
parser.add_argument("-pw", "--pos_weight", type=float, default=None, help="Weights positive samples for imbalance")

# --- runtime ---
parser.add_argument("-w", "--num_workers", type=int, default=4, help="dataloader worker size")
# _str2bool (not type=bool) so that "--with_cuda false" really disables CUDA.
parser.add_argument("--with_cuda", type=_str2bool, default=True, help="training with CUDA: true, or false")
parser.add_argument("--cuda_devices", type=int, nargs='*', default=None, help="CUDA device ids")

parser.add_argument("--log_freq", type=int, default=100, help="printing loss every n iter: setting n")
parser.add_argument("--seed", type=int, default=0, help="random seed")

# --- feature options ---
parser.add_argument("-ud","--use_domain", type=str, required=True, help="use domain features or not. options: rdkit: combination od rdkit feature and bozhao features. no_rdkit: only bozhao features. no_domain: neither.")
parser.add_argument("-mb","--max_nbonds", type=int, default=15, help="maximum number of bonds for binary features")
parser.add_argument("-ma","--max_natoms", type=int, default=15, help="maximum number of atoms for binary features")
parser.add_argument("--abs", type=str, default='abs', help="Take the average over aboslute/no absolute/sigmoid/relu value of predicted yield")


_StoreAction(option_strings=['--abs'], dest='abs', nargs=None, const=None, default='abs', type=<class 'str'>, choices=None, help='Take the average over aboslute/no absolute/sigmoid/relu value of predicted yield', metavar=None)

In [4]:
# Keep library warnings out of the cell outputs.
warnings.filterwarnings("ignore")

# One preset per dataset; every value is a string because it is handed to
# argparse below, which performs the type conversion.
dy_args = dict(dn="dy", layers="2", mv='model_5.3.1-learn-w', hidden='200', epochs='200', sn="4", domain="rdkit")
su_args = dict(dn="su", layers="2", mv='model_5.3.1-learn-w', hidden='200', epochs='200', sn="9", domain="rdkit")
az_args = dict(dn="az", layers="1", mv='model_5.3.1-learn-w', hidden='100', epochs='100', sn="3", domain="no_domain")

# Select the preset to analyse.
args_dict = su_args

# Build the argument vector explicitly, then let the parser fill in defaults.
cli_args = [
    "-dn", args_dict["dn"],
    "-mv", args_dict["mv"],
    "--batch_size", '40',
    "--use_domain", args_dict["domain"],
    "-gc", "0.8",
    "--layers", args_dict["layers"],
    "--hidden", args_dict["hidden"],
    "--epochs", args_dict["epochs"],
    "-sn", args_dict["sn"],
]
args = parser.parse_args(args=cli_args)

In [5]:
# Seed PyTorch on the CPU and on every CUDA device.
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

# ---- input specs ----
data_type = args.dataset_name
gc = 'gc' if args.grad_clip else ''
Abs = args.abs
model_dir = os.path.join('output', args.model_version)
# The run directory name is the dash-joined list of settings below.
model_name = '-'.join(map(str, [data_type + '_' + args.model_version, gc, args.use_domain, Abs, 'set', args.split_set_num,
                                args.hidden, args.layers, args.epochs, args.lr, args.lr_decay,
                                args.lr_steps, args.batch_size]))

model_path = os.path.join(model_dir, model_name)
model_file = os.path.join(model_path, 'yield.model')
# Check the model file itself, not just its directory, so a missing file is
# reported here with a specific exception type.
if not os.path.isfile(model_file):
    raise FileNotFoundError('model \n' + model_file + "\ndoesn't exist")

betas = (args.adam_beta1, args.adam_beta2)
device = torch.device('cpu')

# NOTE(security): torch.load relies on pickle; only load model files from a trusted source.
model = torch.load(model_file, map_location=device)
# NOTE(review): weight_decay is fixed to 0 here; args.adam_weight_decay is not used.
optimizer = opt.Adam(model.parameters(), lr=args.lr, betas=betas, weight_decay=0)
print("Model loaded from:\n ", model_path)

# If use_domain=no_domain, just load either the rdkit or the no_rdkit .csv file
# and set the domain features to 0.
ext = '_' + args.use_domain if 'rdkit' in args.use_domain else '_no_rdkit'
data_path = os.path.join(args.dataset_path, data_type)
processed_path = os.path.join(data_path, 'processed')

input_split_idx_file = os.path.join(processed_path, 'train_test_idxs.pickle')
processed_data_file = os.path.join(processed_path, ''.join([data_type, ext, '.csv']))
selected_features_fn = os.path.join(data_path, 'rf_results', 'selected_feats.txt')

# ---- output specs ----
# Model predictions and the summary are saved to the model_res folder.
model_res = os.path.join(model_dir, model_name, 'model_res')
model_preds_test_fn = os.path.join(model_res, 'model_preds_test.csv')
model_preds_train_fn = os.path.join(model_res, 'model_preds_train.csv')
model_summary_fn = os.path.join(model_res, 'model_summary.pickle')

# exist_ok avoids the separate exists() check and the race between check and creation.
os.makedirs(model_res, exist_ok=True)

Model loaded from:
  output/model_5.3.1-learn-w/su_model_5.3.1-learn-w-gc-rdkit-abs-set-9-200-2-200-0.01-0.5-10000-40


In [6]:
# Display the folder that receives the model predictions and summary.
model_res

'output/model_5.3.1-learn-w/su_model_5.3.1-learn-w-gc-rdkit-abs-set-9-200-2-200-0.01-0.5-10000-40/model_res'

In [7]:
################################################################
# Load the train/test sets
################################################################
split_set_num = args.split_set_num

# NOTE(security): pickle.load can execute arbitrary code; only use split files from a trusted source.
with open(input_split_idx_file, 'rb') as handle:
    idx_dict = pickle.load(handle)

# The first line of the file holds the comma-separated selected feature names.
# A context manager is used so the file handle is closed instead of leaked.
# NOTE(review): the line is not stripped, so the last name keeps any trailing
# newline and would presumably not match a column. Left unchanged because
# altering the selected set could change the feature count the saved model
# expects -- confirm against the training code.
with open(selected_features_fn, 'r') as feats_handle:
    selected_features = feats_handle.readlines()[0].split(',')

print("Loading Dataset in {dataset}".format(dataset=processed_data_file))
print("Using the split set number {split}".format(split=split_set_num))

# Rows are picked by position using the stored index lists for this split.
df = pd.read_csv(processed_data_file, index_col=0)
train_set = df.iloc[idx_dict['train_idx'][split_set_num]]
test_set = df.iloc[idx_dict['test_idx'][split_set_num]]

# Every column that is not an id/yield/SMILES column is treated as a domain feature.
smiles_feature_names = ["id", "yield", "reactant_smiles", "solvent_smiles", "base_smiles", "product_smiles"]
domain_feature_names = [f for f in df.columns if f not in smiles_feature_names]

# Apply feature selection (only for the 'rdkit' setting).
print("Number of all available features: {num}".format(num=len(domain_feature_names)))
if args.use_domain == 'rdkit':
    domain_feature_names = [f for f in domain_feature_names if f in selected_features]
    print("Selecting features...")
    print("Number of features after feature selection: {num}".format(num=len(domain_feature_names)))
else:
    print("Not running feature selection!")


train_set_domain = train_set[domain_feature_names]
test_set_domain = test_set[domain_feature_names]
train_set_smiles = train_set[smiles_feature_names]
test_set_smiles = test_set[smiles_feature_names]

# Fit the scaler on the training features only, then apply it to the test features.
scaler = StandardScaler()

train_set_domain_scaled = pd.DataFrame(scaler.fit_transform(train_set_domain), columns=domain_feature_names)
test_set_domain_scaled = pd.DataFrame(scaler.transform(test_set_domain), columns=domain_feature_names)

# Sanity check: the domain, SMILES and scaled frames must have the same number of rows.
assert train_set_domain.shape[0] == train_set_smiles.shape[0] == train_set_domain_scaled.shape[0]
assert test_set_domain.shape[0] == test_set_smiles.shape[0] == test_set_domain_scaled.shape[0]

Loading Dataset in ./data/su/processed/su_rdkit.csv
Using the split set number 9
Number of all available features: 752
Selecting features...
Number of features after feature selection: 682


In [8]:
print("{:-^80}".format("Dataset"))
# Feed the train SMILES to the test dataset and vice versa to make sure our
# encoding is consistent. No label information is used on the test set here.
train_dataset = RxnGD(train_set_domain_scaled, train_set_smiles, test_set_smiles, args.max_nbonds, args.max_natoms, args.use_domain)
test_dataset = RxnGD(test_set_domain_scaled, test_set_smiles, train_set_smiles, args.max_nbonds, args.max_natoms, args.use_domain)

# Read the feature sizes off one sample.
sample = train_dataset[3]
afeats_size = sample["atom_feats"].shape[-1]
bfeats_size = sample["bond_feats"].shape[-1]
dmfeats_size = sample['domain_feats'].shape[-1]
# binary_size is the product of the last two dimensions of the binary features.
d1, d2, d3 = sample["binary_feats"].shape
binary_size = d3 * d2

print("{:d} samples for training ,{:d} samples for testing".format(train_set.shape[0], test_set.shape[0]))
# Both loaders are only used for analysis of a loaded model, so neither is
# shuffled: the train outputs are later passed alongside train_set_smiles, and
# a fixed sample order keeps that pairing and the generated figures stable.
# NOTE(review): drop_last=True discards a trailing partial batch; it has no
# effect with the default test_batch_size of 1.
train_dataloader = DataLoader(train_dataset, batch_size=args.test_batch_size, num_workers=args.num_workers, shuffle=False,
                              collate_fn=collate_fn, drop_last=True)

test_dataloader = DataLoader(test_dataset, batch_size=args.test_batch_size, num_workers=args.num_workers,
                             collate_fn=collate_fn, drop_last=True)


print("{:-^80}".format("Model"))
# Only the two values that the format string actually uses are passed.
print("Graph convolution layers: {}  Hidden size: {}".format(
    args.layers, args.hidden))

------------------------------------Dataset-------------------------------------
3180 samples for training ,1363 samples for testing
-------------------------------------Model--------------------------------------
Graph convolution layers: 2  Hidden size: 200


### Dump domain weights

In [9]:
# Take the weight of the model's domain layer and pass it, with the domain
# feature names and the output folder, to the helper. Per the cell output, the
# helper writes the weights to domain_weights.txt inside model_res.
domain_weights= model.yield_scoring.domain.weight
sorted_feat_names_weights = lu.get_domain_weights(domain_weights,domain_feature_names, model_res)

Domain weights written to output/model_5.3.1-learn-w/su_model_5.3.1-learn-w-gc-rdkit-abs-set-9-200-2-200-0.01-0.5-10000-40/model_res/domain_weights.txt


### Generate Test set analysis

In [10]:

# Run the loaded model over the test loader; the helper returns the actual
# yields, the predicted yields, a per-sample data dict and the R2 score.
correct_yields_test, pred_yields_test, data_dict_test,r2score = lu.get_model_outputs(model,test_dataloader,test_set_smiles)
# Print the summary stats shown below (R2 and low/high-yield breakdown).
lu.get_basic_stats(data_dict_test,model_res,r2score,'Test')
# Write the actual and predicted yields for each sample to model_preds_test_fn.
lu.write_model_preds(model_preds_test_fn,data_dict_test)
# Generate the activation figures for the test samples.
lu.plot_activation_all_data(data_dict_test,'test',model_res)


Test set
R2 score: 0.8275432701519169
Num low-yield: 1038
Num high-yield: 325
Percentage of lows --> high: 3.37%
Percentage of lows --> low: 96.63%
Percentage of highs --> high: 79.69%
Percentage of highs --> low: 20.31%

Writing actual and predicted yield values for each sample...
Done!

Generating activation figures...
Done!


### Generate Train set analysis

In [11]:
# Same analysis as for the test set, run over the train loader.
# Note that r2score is reassigned here and now holds the train-set value.
correct_yields_train, pred_yields_train, data_dict_train,r2score= lu.get_model_outputs(model,train_dataloader,train_set_smiles)
# Print the summary stats shown below (R2 and low/high-yield breakdown).
lu.get_basic_stats(data_dict_train,model_res,r2score, 'Train')
# Write the actual and predicted yields for each sample to model_preds_train_fn.
lu.write_model_preds(model_preds_train_fn,data_dict_train)
# Generate the activation figures for the train samples.
lu.plot_activation_all_data(data_dict_train,'train',model_res)

Train set
R2 score: 0.946601495845975
Num low-yield: 2458
Num high-yield: 722
Percentage of lows --> high: 2.20%
Percentage of lows --> low: 97.80%
Percentage of highs --> high: 85.18%
Percentage of highs --> low: 14.82%

Writing actual and predicted yield values for each sample...
Done!

Generating activation figures...
Done!
