In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
from rdkit.Chem import MolFromSmiles, Lipinski # NumHDonors, NumHAcceptors

In [2]:
## generated by ReBADD-SE
filepath_rebadd = os.path.join('outputs_7_calculate_properties_generated', 'frag+reinforce+scst+offpolicy', 'smi_after.csv.0')
df_rebadd = pd.read_csv(filepath_rebadd).loc[:,('smiles', 'bcl2', 'bclxl', 'bclw', 'mw', 'logp')].drop_duplicates(ignore_index=True)

In [3]:
## generated by RationaleRL
filepath_rationale = os.path.join('baseline', 'RationaleRL', 'smi_after.csv.0')
df_rationale = pd.read_csv(filepath_rationale).loc[:,('smiles', 'bcl2', 'bclxl', 'bclw', 'mw', 'logp')].drop_duplicates(ignore_index=True)

In [4]:
## generated by MARS
filepath_mars = os.path.join('baseline', 'MARS', 'smi_after.csv.0')
df_mars = pd.read_csv(filepath_mars).loc[:,('smiles', 'bcl2', 'bclxl', 'bclw', 'mw', 'logp')].drop_duplicates(ignore_index=True)

In [5]:
## generated by ReLeaSE
filepath_release = os.path.join('baseline', 'ReLeaSE', 'smi_after.csv.0')
df_release = pd.read_csv(filepath_release).loc[:,('smiles', 'bcl2', 'bclxl', 'bclw', 'mw', 'logp')].drop_duplicates(ignore_index=True)

In [6]:
## generated by MolGPT
filepath_molgpt = os.path.join('baseline', 'MolGPT', 'smi_after.csv.0')
df_molgpt = pd.read_csv(filepath_molgpt).loc[:,('smiles', 'bcl2', 'bclxl', 'bclw', 'mw', 'logp')].drop_duplicates(ignore_index=True)

In [7]:
def eval_ro5(df):
    
    records = []
    
    for i, smi in tqdm.tqdm(enumerate(df.loc[:,'smiles'].values), total=len(df)):

        mol = MolFromSmiles(smi)

        if mol is not None:

            mwt = df.loc[i,'mw']
            clogp = df.loc[i,'logp']
            num_H_donor = Lipinski.NumHDonors(mol)
            num_H_acceptor = Lipinski.NumHAcceptors(mol)

            flag_mwt = 0 if mwt < 500 else 1
            flag_clogp = 1 if clogp > 5 else 0
            flag_donor = 0 if num_H_donor < 5 else 1
            flag_acceptor = 0 if num_H_acceptor < 10 else 1

            cnt_violation = flag_mwt + flag_clogp + flag_donor + flag_acceptor

            records.append({'smiles':smi,
                            'mwt':mwt,
                            'clogp':clogp,
                            'donor':num_H_donor,
                            'acceptor':num_H_acceptor,
                            'violation_count':cnt_violation,
                            'flag_mwt':flag_mwt,
                            'flag_clogp':flag_clogp,
                            'flag_donor':flag_donor,
                            'flag_acceptor':flag_acceptor,
                           })
        
    return pd.DataFrame.from_dict(records)

In [8]:
df_rebadd = eval_ro5(df_rebadd)
df_rationale = eval_ro5(df_rationale)
df_mars = eval_ro5(df_mars)
df_release = eval_ro5(df_release)
df_molgpt = eval_ro5(df_molgpt)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 175/175 [00:00<00:00, 3329.73it/s]
 49%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 2456/5000 [00:00<00:00, 3089.83it/s][17:41:48] SMILES Parse Error: syntax error while parsing: None
[17:41:48] SMILES Parse Error: Failed parsing SMILES 'None' for input: 'None'
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 3082.02it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

[17:41:49] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 13
[17:41:49] SMILES Parse Error: unclosed ring for input: 'CCCNC(=O)C(C)N(Cc1ccc(Cl)c(Cl)c1)C(=O)CN(c1cc(Cl)cc1Cl)ccc1Cl'
 48%|██████████████████████████████████████████████████████████████████████████████████████████                                                                                                | 1319/2724 [00:00<00:00, 6599.24it/s][17:41:49] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 13 14 15
[17:41:49] SMILES Parse Error: extra close parentheses while parsing: CCCNC(=O)c1ccccc1C(F)(F)F)Nc1ccccc1CN1CCN(C(C)=O)CC1
[17:41:49] SMILES Parse Error: Failed parsing SMILES 'CCCNC(=O)c1ccccc1C(F)(F)F)Nc1ccccc1CN1CCN(C(C)=O)CC1' for input: 'CCCNC(=O)c1ccccc1C(F)(F)F)Nc1ccccc1CN1CCN(C(C)=O)CC1'
[17:41:49] SMILES Parse Error: extra open parentheses for input: 'CCC(CNC(=O)CN(c1cc(Cl)ccc1C)S(=O)(=O)c1ccccc1'
[17:41:49] SMILES Parse Error: extra open parentheses for input: 'CNC(=O)C(C'
[17:41:49] Explicit valence f

 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                  | 1992/2724 [00:00<00:00, 6657.52it/s][17:41:49] Explicit valence for atom # 6 N, 4, is greater than permitted
[17:41:49] SMILES Parse Error: unclosed ring for input: 'O=C(Nc1cc(Cl)ccc1NC(=O)CN(c1cc(Cl)ccc1Cl)S(C)(=O)=O)c1'
[17:41:49] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 20 22 24
[17:41:49] SMILES Parse Error: extra close parentheses while parsing: CCNC(=O)C(C)N(Cc1ccc(Cl)cc1Cl)C(=O)CN(c1cc(Cl)ccc1Cl)S(C)(=O)=O)c1
[17:41:49] SMILES Parse Error: Failed parsing SMILES 'CCNC(=O)C(C)N(Cc1ccc(Cl)cc1Cl)C(=O)CN(c1cc(Cl)ccc1Cl)S(C)(=O)=O)c1' for input: 'CCNC(=O)C(C)N(Cc1ccc(Cl)cc1Cl)C(=O)CN(c1cc(Cl)ccc1Cl)S(C)(=O)=O)c1'
[17:41:49] SMILES Parse Error: extra close parentheses while parsing: CCNC(=O)C(C)N(Cc1ccccc1F)C(=O)CN(c1cc(Cl)ccc1C)S(C)(=O)=O)c1ccc(Cl)cc1
[17:41:49] SMILES Parse Error: Fa

[17:41:49] SMILES Parse Error: extra close parentheses while parsing: CNC(=O)C(Cc1ccccc1)N(Cc1ccc(Cl)cc1Cl)S(=O)(=O)c1ccc(Cl)cc1)CCCC
[17:41:49] SMILES Parse Error: Failed parsing SMILES 'CNC(=O)C(Cc1ccccc1)N(Cc1ccc(Cl)cc1Cl)S(=O)(=O)c1ccc(Cl)cc1)CCCC' for input: 'CNC(=O)C(Cc1ccccc1)N(Cc1ccc(Cl)cc1Cl)S(=O)(=O)c1ccc(Cl)cc1)CCCC'
[17:41:49] Can't kekulize mol.  Unkekulized atoms: 23 24 25 27 28
[17:41:49] SMILES Parse Error: extra close parentheses while parsing: CCNC(=O)C(C)N(Cc1c(Cl)cccc1Cl)C(=O)CN(c1cc(Cl)cc(Cl)c1C)S(C)(=O)=O)C
[17:41:49] SMILES Parse Error: Failed parsing SMILES 'CCNC(=O)C(C)N(Cc1c(Cl)cccc1Cl)C(=O)CN(c1cc(Cl)cc(Cl)c1C)S(C)(=O)=O)C' for input: 'CCNC(=O)C(C)N(Cc1c(Cl)cccc1Cl)C(=O)CN(c1cc(Cl)cc(Cl)c1C)S(C)(=O)=O)C'
[17:41:49] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 28 29 30
[17:41:49] SMILES Parse Error: extra open parentheses for input: 'CCCCN(CCNC(=O)COc1cc(Cl)ccc1OC'
[17:41:49] Can't kekulize mol.  Unkekulized atoms: 9 10 12 13 14 15 16
[17:41:49] SMILES Pars

[17:41:50] SMILES Parse Error: unclosed ring for input: 'CC(Cn1c(SCCn2c(SC(C)C)nnc2C(C)N2CCC(C)CC2)nnc1=O)n1c(=O)[nH]c(=O)c2c1'
[17:41:50] SMILES Parse Error: unclosed ring for input: 'CC(=O)OC1CCC2(C)C3CC=C4C(CC(O)CC5(C)C4CC(OC(C)=O)C3(C)C(O)CC4)C2C1(C)C'
[17:41:50] SMILES Parse Error: syntax error while parsing: CCCC(c1cnc2nc((C)nn2c1)C(NC(=O)CN1CCN(C(=O)CCc2c(C)nc3nc(O)nn3c2C)CC1)C(F)(F)F
[17:41:50] SMILES Parse Error: Failed parsing SMILES 'CCCC(c1cnc2nc((C)nn2c1)C(NC(=O)CN1CCN(C(=O)CCc2c(C)nc3nc(O)nn3c2C)CC1)C(F)(F)F' for input: 'CCCC(c1cnc2nc((C)nn2c1)C(NC(=O)CN1CCN(C(=O)CCc2c(C)nc3nc(O)nn3c2C)CC1)C(F)(F)F'
[17:41:50] SMILES Parse Error: extra close parentheses while parsing: CCC(C)Br)c1ccc(OCC)c(NC(=O)CN2C(=O)SC(=Cc3cccc(OCc4ccc(Cl)cc4Cl)c3)C2=O)c1
[17:41:50] SMILES Parse Error: Failed parsing SMILES 'CCC(C)Br)c1ccc(OCC)c(NC(=O)CN2C(=O)SC(=Cc3cccc(OCc4ccc(Cl)cc4Cl)c3)C2=O)c1' for input: 'CCC(C)Br)c1ccc(OCC)c(NC(=O)CN2C(=O)SC(=Cc3cccc(OCc4ccc(Cl)cc4Cl)c3)C2=O)c1'
[17:41:50] SMILE

 45%|███████████████████████████████████████████████████████████████████████████████████▋                                                                                                      | 2066/4591 [00:00<00:00, 4715.01it/s][17:41:50] SMILES Parse Error: extra close parentheses while parsing: COCCCNC(=O)C(Cc1ccccc1)N(Cc1ccc(F)cc1)C(=O)COc1ccc(S(=O)(=O)Nc2cccF)cc2)cc1
[17:41:50] SMILES Parse Error: Failed parsing SMILES 'COCCCNC(=O)C(Cc1ccccc1)N(Cc1ccc(F)cc1)C(=O)COc1ccc(S(=O)(=O)Nc2cccF)cc2)cc1' for input: 'COCCCNC(=O)C(Cc1ccccc1)N(Cc1ccc(F)cc1)C(=O)COc1ccc(S(=O)(=O)Nc2cccF)cc2)cc1'
[17:41:50] SMILES Parse Error: syntax error while parsing: COCCCN(C(=O)CSc1nnc2ccc(S((=O)(=O)N(C)C)cn12)c1c(O)nc(=O)n(CC(C)C)c1N
[17:41:50] SMILES Parse Error: Failed parsing SMILES 'COCCCN(C(=O)CSc1nnc2ccc(S((=O)(=O)N(C)C)cn12)c1c(O)nc(=O)n(CC(C)C)c1N' for input: 'COCCCN(C(=O)CSc1nnc2ccc(S((=O)(=O)N(C)C)cn12)c1c(O)nc(=O)n(CC(C)C)c1N'
[17:41:50] Can't kekulize mol.  Unkekulized atoms: 8 9 10 40 41
[17:

[17:41:50] SMILES Parse Error: unclosed ring for input: 'Cc1cc2cc(C(c3nnnn3C3CCCCC3)N3CCN(c4cc(Cl)ccc4C)CC3)c(=O)[nH]c2cc12'
[17:41:50] Can't kekulize mol.  Unkekulized atoms: 4 5 7 8 9 10 11 12 14
[17:41:50] SMILES Parse Error: extra close parentheses while parsing: CC(C)(C)c1ccc(S(=O)(=O)N(C(=O)c2ccc(Cl)cc2)c2ccc3ocC(O)(CO)CO)cc3c2)cc1
[17:41:50] SMILES Parse Error: Failed parsing SMILES 'CC(C)(C)c1ccc(S(=O)(=O)N(C(=O)c2ccc(Cl)cc2)c2ccc3ocC(O)(CO)CO)cc3c2)cc1' for input: 'CC(C)(C)c1ccc(S(=O)(=O)N(C(=O)c2ccc(Cl)cc2)c2ccc3ocC(O)(CO)CO)cc3c2)cc1'
[17:41:50] SMILES Parse Error: unclosed ring for input: 'CS(=O)(=O)N(C)CCCC1C(=O)NCCCCN(C(=O)c2cc[nH]n2)CC(=O)N1CCOc1ccccc1C(=O)N1CCOC'
[17:41:50] SMILES Parse Error: extra close parentheses while parsing: COc1cc(Cl)cc2c1NCCC1C2)N(C)CC2c1ccc(F)cc1NS(=O)(=O)c1cc(C(=O)N3c4ccc(Cl)cc4CC4CN(C)CCC34)ccc1Cl
[17:41:50] SMILES Parse Error: Failed parsing SMILES 'COc1cc(Cl)cc2c1NCCC1C2)N(C)CC2c1ccc(F)cc1NS(=O)(=O)c1cc(C(=O)N3c4ccc(Cl)cc4CC4CN(C)CCC34)ccc

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4591/4591 [00:00<00:00, 4954.21it/s]


In [9]:
df_rebadd.loc[:,'violation_count'].value_counts() / len(df_rebadd)

3    0.468571
2    0.451429
4    0.080000
Name: violation_count, dtype: float64

In [10]:
df_rationale.loc[:,'violation_count'].value_counts() / len(df_rationale)

3    0.502901
2    0.425085
4    0.069014
1    0.003001
Name: violation_count, dtype: float64

In [11]:
df_mars.loc[:,'violation_count'].value_counts() / len(df_mars)

2    0.566540
1    0.230989
3    0.161597
0    0.035171
4    0.005703
Name: violation_count, dtype: float64

In [12]:
df_release.loc[:,'violation_count'].value_counts() / len(df_release)

1    0.427896
2    0.294326
0    0.277778
Name: violation_count, dtype: float64

In [13]:
df_molgpt.loc[:,'violation_count'].value_counts() / len(df_molgpt)

2    0.608892
1    0.323629
0    0.035206
3    0.029339
4    0.002934
Name: violation_count, dtype: float64

In [14]:
df_vscore = pd.DataFrame({
    'ZINC15':[0, 0.5, 0.333, 0.167, 0],
    'ReBADD-SE':[0, 0, 0.451, 0.469, 0.08],
    'RationaleRL':[0, 0.003, 0.425, 0.503, 0.069],
    'MARS':[0.035, 0.231, 0.567, 0.162, 0.006],
    'ReLeaSE':[0.278, 0.428, 0.294, 0, 0],
    'MolGPT':[0.035, 0.324, 0.609, 0.029, 0.003],
})

In [18]:
df_vscore.T

Unnamed: 0,0,1,2,3,4
ZINC15,0.0,0.5,0.333,0.167,0.0
ReBADD-SE,0.0,0.0,0.451,0.468,0.08
RationaleRL,0.0,0.003,0.425,0.503,0.069
MARS,0.035,0.231,0.567,0.162,0.006
ReLeaSE,0.278,0.428,0.294,0.0,0.0
MolGPT,0.035,0.324,0.609,0.029,0.003


In [16]:
pd.concat((
    df_rebadd.loc[:,('flag_mwt', 'flag_clogp', 'flag_donor', 'flag_acceptor')].mean().to_frame(name='ReBADD-SE'),
    df_rationale.loc[:,('flag_mwt', 'flag_clogp', 'flag_donor', 'flag_acceptor')].mean().to_frame(name='RationaleRL'),
    df_mars.loc[:,('flag_mwt', 'flag_clogp', 'flag_donor', 'flag_acceptor')].mean().to_frame(name='MARS'),
    df_release.loc[:,('flag_mwt', 'flag_clogp', 'flag_donor', 'flag_acceptor')].mean().to_frame(name='ReLeaSE'),
    df_molgpt.loc[:,('flag_mwt', 'flag_clogp', 'flag_donor', 'flag_acceptor')].mean().to_frame(name='MolGPT'),
), axis=1)


Unnamed: 0,ReBADD-SE,RationaleRL,MARS,ReLeaSE,MolGPT
flag_mwt,1.0,0.9998,0.923004,0.678487,0.938614
flag_clogp,1.0,0.994799,0.438213,0.338061,0.600316
flag_donor,0.08,0.092819,0.176806,0.0,0.020086
flag_acceptor,0.548571,0.55051,0.33365,0.0,0.082148


In [17]:
df_rebadd

Unnamed: 0,smiles,mwt,clogp,donor,acceptor,violation_count,flag_mwt,flag_clogp,flag_donor,flag_acceptor
0,O=C(NC(=S)Nc1ccc2oc(C=Cc3c(Br)ccc(Br)c3C3C=CCC...,784.970992,8.0457,2,8,2,1,1,0,0
1,O=C(NC(=S)Nc1ccc2oc(C=Cc3c(Br)ccc(Br)c3C3C=CCC...,782.991727,9.1994,2,7,2,1,1,0,0
2,O=C(NC(=S)Nc1ccc2oc(C=Cc3c(Br)ccc(Br)c3C3C=CCC...,788.865770,10.8677,2,5,2,1,1,0,0
3,O=C(NC(=S)Nc1ccc2oc(C=Cc3c(Cl)ccc(Br)c3C3C=CCC...,707.060480,7.7193,2,8,2,1,1,0,0
4,O=C(NC(=S)Nc1ccc2oc(C=Cc3c(Br)ccc(Br)c3C3C=CCC...,751.009964,7.8284,2,8,2,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...
170,O=C(NC(=S)Nc1ccc2oc(C=Cc3c(Br)ccc(Br)c3C3C=CCC...,1477.157074,20.6435,2,11,3,1,1,0,1
171,O=C(NC(=S)Nc1ccc2oc(C=CC=C(Br)C=Cc3c(Br)ccc(Br...,898.942355,9.6114,2,8,2,1,1,0,0
172,O=C(NC(=S)Nc1ccc2oc(C=Cc3c(Br)ccc(Br)c3C3C=CCC...,1039.092919,10.9485,2,10,3,1,1,0,1
173,O=C(NC(=S)Nc1ccc2oc(C=Cc3c(Br)ccc(Br)c3C3C=CCC...,860.062728,7.8668,3,10,3,1,1,0,1
