In [1]:
import os
import sys
import tqdm
import pandas as pd

In [2]:
REBADD_LIB_PATH = os.path.abspath(os.pardir)
if REBADD_LIB_PATH not in sys.path:
    sys.path = [REBADD_LIB_PATH] + sys.path

from rebadd.evaluate import evaluate_sr_nov_div

In [3]:
filepath_ref = os.path.join(os.pardir, 'data', 'chembl', 'chembl_test_full.csv')
df_ref = pd.read_csv(filepath_ref)
referece_smiles_iter = df_ref.loc[:,'smiles'].values.tolist()

print(len(referece_smiles_iter))

315


In [4]:
input_dir = 'outputs_7_calculate_properties_generated'

filenames = [f'smi_after.csv.{num}' for num in range(10)]

## gsk3b_jnk3_qed_sa

In [5]:
frames = []

for modelname in ['gsk3_jnk3_qed_sa']:
    
    for filename in filenames:
        
        filepath = os.path.join(input_dir, modelname, filename)
        
        df = pd.read_csv(filepath)
        
        df = df.loc[:,('smiles', 'gsk3b', 'jnk3', 'qed', 'sa')]
        df.loc[:,'model'] = modelname
        df.loc[:,'checkpoint'] = filename.split('.')[-1]
        
        frames.append(df)
        
        
data = []

for df in tqdm.tqdm(frames):

    s_sr, s_nov, s_div = evaluate_sr_nov_div(df, referece_smiles_iter, 'gsk3b_jnk3_qed_sa')
    
    data.append({'SR':s_sr, 'Nov':s_nov, 'Div':s_div, 'Model':df.loc[0,'model'], 'Ckpt':df.loc[0,'checkpoint']})
    

df_records = pd.DataFrame(data)
df_records.loc[:,'HMean'] = (df_records.loc[:,'SR'] * df_records.loc[:,'Nov'] * df_records.loc[:,'Div']) ** 0.333

print(df_records.groupby('Model').mean(numeric_only=True))
print(df_records.groupby('Model').std(numeric_only=True))

100%|██████████| 10/10 [00:09<00:00,  1.02it/s]

                       SR       Nov       Div     HMean
Model                                                  
gsk3_jnk3_qed_sa  0.84136  0.692683  0.674449  0.732726
                       SR      Nov       Div     HMean
Model                                                 
gsk3_jnk3_qed_sa  0.00743  0.01321  0.001538  0.006081





In [6]:
frames = []

for modelname in ['MARS', 'MolSearch']:

    filepath = os.path.join('baseline', f'{modelname}.txt')

    df = pd.read_csv(filepath)

    df = df.loc[:,('smiles', 'gsk3b', 'jnk3', 'qed', 'sa')]
    df.loc[:,'model'] = modelname

    frames.append(df)

        
data = []

for df in tqdm.tqdm(frames):

    s_sr, s_nov, s_div = evaluate_sr_nov_div(df, referece_smiles_iter, 'gsk3b_jnk3_qed_sa')
    
    data.append({'SR':s_sr, 'Nov':s_nov, 'Div':s_div, 'Model':df.loc[0,'model']})
    

df_records = pd.DataFrame(data)
df_records.loc[:,'HMean'] = (df_records.loc[:,'SR'] * df_records.loc[:,'Nov'] * df_records.loc[:,'Div']) ** 0.333

df_records

100%|██████████| 2/2 [00:01<00:00,  1.01it/s]


Unnamed: 0,SR,Nov,Div,Model,HMean
0,0.3958,0.756707,0.734864,MARS,0.604073
1,0.231696,0.5272,0.774141,MolSearch,0.455945


## gsk3b_jnk3

In [8]:
frames = []

for modelname in ['gsk3_jnk3']:
    
    for filename in filenames:
        
        filepath = os.path.join(input_dir, modelname, filename)
        
        df = pd.read_csv(filepath)
        
        df = df.loc[:,('smiles', 'gsk3b', 'jnk3', 'qed', 'sa')]
        df.loc[:,'model'] = modelname
        df.loc[:,'checkpoint'] = filename.split('.')[-1]
        
        frames.append(df)
        
        
data = []

for df in tqdm.tqdm(frames):

    s_sr, s_nov, s_div = evaluate_sr_nov_div(df, referece_smiles_iter, 'gsk3b_jnk3')
    
    data.append({'SR':s_sr, 'Nov':s_nov, 'Div':s_div, 'Model':df.loc[0,'model'], 'Ckpt':df.loc[0,'checkpoint']})
    

df_records = pd.DataFrame(data)
df_records.loc[:,'HMean'] = (df_records.loc[:,'SR'] * df_records.loc[:,'Nov'] * df_records.loc[:,'Div']) ** 0.333

print(df_records.groupby('Model').mean(numeric_only=True))
print(df_records.groupby('Model').std(numeric_only=True))

100%|██████████| 10/10 [00:50<00:00,  5.07s/it]

               SR       Nov       Div     HMean
Model                                          
gsk3_jnk3  0.9443  0.703007  0.682842  0.768372
                 SR       Nov       Div     HMean
Model                                            
gsk3_jnk3  0.003367  0.007993  0.001237  0.003042





## gsk3b

In [9]:
frames = []

for modelname in ['gsk3']:
    
    for filename in filenames:
        
        filepath = os.path.join(input_dir, modelname, filename)
        
        df = pd.read_csv(filepath)
        
        df = df.loc[:,('smiles', 'gsk3b', 'jnk3', 'qed', 'sa')]
        df.loc[:,'model'] = modelname
        df.loc[:,'checkpoint'] = filename.split('.')[-1]
        
        frames.append(df)
        
        
data = []

for df in tqdm.tqdm(frames):

    s_sr, s_nov, s_div = evaluate_sr_nov_div(df, referece_smiles_iter, 'gsk3b')
    
    data.append({'SR':s_sr, 'Nov':s_nov, 'Div':s_div, 'Model':df.loc[0,'model'], 'Ckpt':df.loc[0,'checkpoint']})
    

df_records = pd.DataFrame(data)
df_records.loc[:,'HMean'] = (df_records.loc[:,'SR'] * df_records.loc[:,'Nov'] * df_records.loc[:,'Div']) ** 0.333

print(df_records.groupby('Model').mean(numeric_only=True))
print(df_records.groupby('Model').std(numeric_only=True))

100%|██████████| 10/10 [00:07<00:00,  1.35it/s]

            SR       Nov       Div     HMean
Model                                       
gsk3   0.97444  0.999818  0.712734  0.885625
            SR       Nov       Div     HMean
Model                                       
gsk3   0.00211  0.000577  0.006564  0.002677





## jnk3

In [10]:
frames = []

for modelname in ['jnk3']:
    
    for filename in filenames:
        
        filepath = os.path.join(input_dir, modelname, filename)
        
        df = pd.read_csv(filepath)
        
        df = df.loc[:,('smiles', 'gsk3b', 'jnk3', 'qed', 'sa')]
        df.loc[:,'model'] = modelname
        df.loc[:,'checkpoint'] = filename.split('.')[-1]
        
        frames.append(df)
        
        
data = []

for df in tqdm.tqdm(frames):

    s_sr, s_nov, s_div = evaluate_sr_nov_div(df, referece_smiles_iter, 'jnk3')
    
    data.append({'SR':s_sr, 'Nov':s_nov, 'Div':s_div, 'Model':df.loc[0,'model'], 'Ckpt':df.loc[0,'checkpoint']})
    

df_records = pd.DataFrame(data)
df_records.loc[:,'HMean'] = (df_records.loc[:,'SR'] * df_records.loc[:,'Nov'] * df_records.loc[:,'Div']) ** 0.333

print(df_records.groupby('Model').mean(numeric_only=True))
print(df_records.groupby('Model').std(numeric_only=True))

100%|██████████| 10/10 [01:06<00:00,  6.62s/it]

            SR       Nov       Div     HMean
Model                                       
jnk3   0.95152  0.906967  0.759858  0.868907
            SR       Nov       Div     HMean
Model                                       
jnk3   0.00308  0.004341  0.001091  0.001672



