In [1]:
import os
import sys
import tqdm
import pandas as pd

In [2]:
# Put the parent of the current working directory at the front of the module
# search path so that the `rebadd` package imported below can be resolved.
REBADD_LIB_PATH = os.path.abspath(os.pardir)
already_importable = REBADD_LIB_PATH in sys.path
if not already_importable:
    # prepend (not append) so this location wins over any other match
    sys.path = [REBADD_LIB_PATH, *sys.path]

from rebadd.evaluate import evaluate_sr_nov_div

In [3]:
## Test data (navitoclax, ABT-737)
# The reference file has no header row; only its first column is used.
ref_path_parts = (os.pardir, 'data', 'zinc15', 'zinc15_test.txt')
filepath_ref = os.path.join(*ref_path_parts)
df_ref = pd.read_csv(filepath_ref, header=None)

# Plain Python list of the first column's values (presumably SMILES strings --
# TODO confirm). The variable keeps its existing spelling because the
# evaluation cell refers to it by this exact name.
first_column = df_ref.iloc[:, 0]
referece_smiles_iter = first_column.values.tolist()

# Sanity check: number of reference entries that were loaded
print(len(referece_smiles_iter))

2


In [4]:
# Directory holding one sub-directory per baseline model
input_dir = 'baseline'

# Baseline models to evaluate; each name is also used as a sub-directory of `input_dir`
modelnames = ['JTVAE', 'MARS', 'MolGPT', 'RationaleRL', 'ReLeaSE']

# Ten result files per model, suffixed with the checkpoint index 0..9
filenames = ['smi_after.csv.' + str(idx) for idx in range(10)]

In [5]:
# Collect one DataFrame per (model, checkpoint) result file.
frames = []

for modelname in modelnames:
    for filename in filenames:
        filepath = os.path.join(input_dir, modelname, filename)

        # Keep only the `smiles` column and the three `bcl*` columns ...
        df = pd.read_csv(filepath)[['smiles', 'bcl2', 'bclxl', 'bclw']]

        # ... then tag every row with its origin. The checkpoint tag is the
        # text after the last '.' of the file name, so it is stored as a string.
        df = df.assign(
            model=modelname,
            checkpoint=filename.rsplit('.', 1)[-1],
        )

        frames.append(df)

In [6]:
# Score every loaded frame and keep one summary record per frame.
data = []

for df in tqdm.tqdm(frames):
    # evaluate_sr_nov_div yields three values, stored below as SR / Nov / Div
    # (presumably success rate, novelty and diversity -- TODO confirm).
    s_sr, s_nov, s_div = evaluate_sr_nov_div(df, referece_smiles_iter, 'bcl2_bclxl_bclw')

    record = {
        'SR': s_sr,
        'Nov': s_nov,
        'Div': s_div,
        # every row of a frame carries the same model/checkpoint tag,
        # so the row labelled 0 is representative
        'Model': df.at[0, 'model'],
        'Ckpt': df.at[0, 'checkpoint'],
    }
    data.append(record)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:58<00:00,  1.18s/it]


In [7]:
# One row per evaluated frame, built from the records collected in `data`.
df_records = pd.DataFrame(data)

# 'HMean' is the cube root of SR * Nov * Div, i.e. the geometric mean of the
# three scores (despite the column name). The exponent is written as an exact
# 1/3 rather than a truncated decimal such as 0.333, which would skew every
# non-zero value slightly away from the true cube root.
score_product = df_records['SR'] * df_records['Nov'] * df_records['Div']
df_records['HMean'] = score_product ** (1.0 / 3.0)

df_records

Unnamed: 0,SR,Nov,Div,Model,Ckpt,HMean
0,0.0002,1.0,0.0,JTVAE,0,0.0
1,0.0006,1.0,0.0,JTVAE,1,0.0
2,0.0002,1.0,0.0,JTVAE,2,0.0
3,0.0004,1.0,0.0,JTVAE,3,0.0
4,0.0,0.0,0.0,JTVAE,4,0.0
5,0.0006,1.0,0.0,JTVAE,5,0.0
6,0.0004,1.0,0.0,JTVAE,6,0.0
7,0.0002,1.0,0.0,JTVAE,7,0.0
8,0.0004,1.0,0.0,JTVAE,8,0.0
9,0.0004,1.0,0.0,JTVAE,9,0.0


In [8]:
# Per-model average of each numeric score across checkpoints.
# numeric_only=True leaves out the string-valued 'Ckpt' column.
(
    df_records
    .groupby('Model')
    .mean(numeric_only=True)
)

Unnamed: 0_level_0,SR,Nov,Div,HMean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JTVAE,0.00034,0.9,0.0,0.0
MARS,0.1212,1.0,0.907815,0.479499
MolGPT,0.000537,1.0,0.607643,0.058352
RationaleRL,0.342634,1.0,0.830946,0.658121
ReLeaSE,0.000229,0.7,0.144425,0.018291


In [9]:
# Per-model standard deviation of each numeric score across checkpoints.
# numeric_only=True leaves out the string-valued 'Ckpt' column.
(
    df_records
    .groupby('Model')
    .std(numeric_only=True)
)

Unnamed: 0_level_0,SR,Nov,Div,HMean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JTVAE,0.00019,0.316228,0.0,0.0
MARS,0.0033,0.0,0.000303,0.004353
MolGPT,0.000279,0.0,0.422914,0.04084
RationaleRL,0.005235,0.0,0.000848,0.003352
ReLeaSE,0.000207,0.483046,0.244225,0.029646
