In [1]:
import pandas as pd

In [2]:
!ls

[34mdrd2[m[m
drd2_success_rate.txt
get_JTNN_score_and_validity.ipynb
get_SMILES_validity.ipynb
[34mlogp04[m[m
[34mlogp06[m[m
[34mqed[m[m
qed_success_rate.txt
top_models.txt
validate_drd2_0to19-2021.04.13-21.37.05.log
validate_logp04_0to18-2021.04.13-21.36.42.log
validate_logp06_0to18-2021.04.13-21.36.42.log
validate_qed_0to6-2021.04.14-08.17.48.log
validate_qed_15to19-2021.04.14-08.20.03.log
validate_qed_7to14-2021.04.14-08.18.30.log


In [3]:
import glob, os
import numpy as np

In [4]:
cwd = os.getcwd()

In [65]:
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors



In [66]:
def similarity(a, b):
    if a is None or b is None:
        return 0.0
    amol = Chem.MolFromSmiles(a)
    bmol = Chem.MolFromSmiles(b)
    if amol is None or bmol is None:
        return 0.0

    fp1 = AllChem.GetMorganFingerprintAsBitVect(amol, 2, nBits=2048, useChirality=False)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(bmol, 2, nBits=2048, useChirality=False)
    return DataStructs.TanimotoSimilarity(fp1, fp2)

def get_logp_results(paths, num_decode=20, delta=0.4):    
    cwd = os.getcwd()

    for d in paths:
        os.chdir(d)
        results_files = glob.glob('results.*')
        for f in results_files:
            df = pd.read_csv(f,sep='\s+',header=None)
            data = df.values.tolist()
            
            data = [(a,b,float(c),float(d)) for a,b,c,d in data]
            n_mols = len(data) / num_decode
            print(f, len(data), num_decode)
#             assert len(data) % num_decode == 0

            all_logp = []

            for i in range(0, len(data), num_decode):
                set_x = set([x[0] for x in data[i:i+num_decode]])
                assert len(set_x) == 1

                good = [(sim,logp) for _,_,sim,logp in data[i:i+num_decode] if 1 > sim >= delta]
                if len(good) > 0:
                    sim,logp = max(good, key=lambda x:x[1])
                    all_logp.append(max(0,logp))
                else:
                    all_logp.append(0.0) #No improvement

            assert len(all_logp) == n_mols
            all_logp = np.array(all_logp)

            print('Evaluated on %d samples' % (n_mols,))
            print('average improvement', np.mean(all_logp), 'stdev',  np.std(all_logp))
    os.chdir(cwd)
            
            
def get_drd2_results(paths, num_decode=20, sim_delta=0.4, prop_delta=0.5):    
    cwd = os.getcwd()
    for d in paths:
        os.chdir(d)
        results_files = glob.glob('results.*')
        for f in results_files:
            df = pd.read_csv(f,sep='\s+',header=None)
            data = df.values.tolist()
            
            data = [(a,b,float(c),float(d)) for a,b,c,d in data]
            n_mols = len(data) / num_decode

            n_succ = 0.0
            for i in range(0, len(data), num_decode):
                set_x = set([x[0] for x in data[i:i+num_decode]])
                assert len(set_x) == 1

                good = [(sim,prop) for _,_,sim,prop in data[i:i+num_decode] if 1 > sim >= sim_delta and prop >= prop_delta]
                if len(good) > 0:
                    n_succ += 1            
            print('Evaluated on %d samples' % (n_mols,))
            print('success rate', n_succ / n_mols)    
    os.chdir(cwd)
    
def get_qed_results(paths, num_decode=20, sim_delta=0.4, prop_delta=0.9):

    num_decode=20
    sim_delta=0.4
    prop_delta=0.9
    cwd = os.getcwd()
    for d in paths:
        os.chdir(d)
        results_files = glob.glob('results.*')
        for f in results_files:
            df = pd.read_csv(f,sep='\s+',header=None)
            data = df.values.tolist()
            data = [(a,b,float(c),float(d)) for a,b,c,d in data]
            n_mols = len(data) / num_decode

            n_succ = 0.0
            for i in range(0, len(data), num_decode):
                set_x = set([x[0] for x in data[i:i+num_decode]])
                assert len(set_x) == 1

                good = [(sim,prop) for _,_,sim,prop in data[i:i+num_decode] if 1 > sim >= sim_delta and prop >= prop_delta]
                if len(good) > 0:
                    n_succ += 1

            print('Evaluated on %d samples' % (n_mols,))
            print('success rate', n_succ / n_mols)
    os.chdir(cwd)

    
def get_diversity(paths, num_decode=20, sim_delta=0.4, prop_delta=0.9):

    cwd = os.getcwd()
    def convert(x):
        return None if x == "None" else x

    for d in paths:
        os.chdir(d)
        results_files = glob.glob('results.*')
        for f in results_files:
            df = pd.read_csv(f,sep='\s+',header=None)
            data = df.values.tolist()
            data = [(a,b,float(c),float(d)) for a,b,c,d in data]

            all_div = []
            n_succ = 0
            for i in range(0, len(data), num_decode):
                set_x = set([x[0] for x in data[i:i+num_decode]])
                assert len(set_x) == 1

                good = [convert(y) for x,y,sim,prop in data[i:i+num_decode] if sim >= sim_delta and prop >= prop_delta]
                if len(good) == 0:
                    continue

                good = list(set(good))
                if len(good) == 1:
                    all_div.append(0.0)
                    continue
                n_succ += 1

                div = 0.0
                tot = 0
                for i in range(len(good)):
                    for j in range(i + 1, len(good)):
                        div += 1 - similarity(good[i], good[j])
                        tot += 1
                div /= tot
                all_div.append(div)

            all_div = np.array(all_div)
            print(np.mean(all_div), np.std(all_div))
            print(n_succ)
    os.chdir(cwd)



## logp04 scores

In [67]:
get_logp_results([os.path.join(cwd,'logp04')])

results.2 4000 20
Evaluated on 200 samples
average improvement 3.560336237220065 stdev 1.885607712794035
results.16 4000 20
Evaluated on 200 samples
average improvement 3.423244601405205 stdev 1.729702222157412
results.5 4000 20
Evaluated on 200 samples
average improvement 3.61871194066513 stdev 1.6321087741044251
results.11 4000 20
Evaluated on 200 samples
average improvement 3.490569037162505 stdev 1.7942542423371872
results.18 4000 20
Evaluated on 200 samples
average improvement 3.46669967405636 stdev 1.7482271113165808
results.4 4000 20
Evaluated on 200 samples
average improvement 3.60850555570224 stdev 1.727601569378745
results.10 4000 20
Evaluated on 200 samples
average improvement 3.56351606688751 stdev 1.7395810194678487
results.3 4000 20
Evaluated on 200 samples
average improvement 3.5324827544183397 stdev 1.7361858658894036
results.6 4000 20
Evaluated on 200 samples
average improvement 3.616442508865325 stdev 1.7967831497953923
results.12 4000 20
Evaluated on 200 samples
aver

In [69]:
get_diversity([os.path.join(cwd,'logp04')],num_decode=20, sim_delta=0.4, prop_delta=0.0)

0.45409235528186015 0.14879691088279223
164
0.46718343099977233 0.1854204547296176
161
0.46233915051176316 0.17756062582746
168
0.49558326109843165 0.15338951846509394
169
0.4696896831715891 0.1878760195272722
160
0.48167357450316345 0.14156084667466964
172
0.5004109872091714 0.15198809525165166
169
0.47870891821616923 0.15950796008988694
168
0.49353443602536884 0.14250338171876117
170
0.47983870194040634 0.16503890699328644
165
0.4503169597230359 0.16890603231319973
155
0.4857085406747186 0.15510578386173135
162
0.4876480216392405 0.14781046187187522
170
0.49968447111561487 0.14457901224002284
175
0.38588521642135226 0.19805540397186533
123
0.4939680896047968 0.15277845837324755
166
0.4692529886935856 0.17496766126736957
159
0.49118079125504205 0.15153605214857097
168


## logp06 scores

In [62]:
get_logp_results([os.path.join(cwd,'logp06')], 0.6)

results.16 4000 20
Evaluated on 200 samples
average improvement 1.8930940864153252 stdev 1.2906545306314654
results.5 4000 20
Evaluated on 200 samples
average improvement 1.7851608937695633 stdev 1.2629175446705543
results.18 4000 20
Evaluated on 200 samples
average improvement 1.92432249357574 stdev 1.384533337571014
results.4 4000 20
Evaluated on 200 samples
average improvement 1.8008799435695202 stdev 1.2819482461190443
results.10 4000 20
Evaluated on 200 samples
average improvement 1.998550355695595 stdev 1.268869401973208
results.3 4000 20
Evaluated on 200 samples
average improvement 1.8108249833277599 stdev 1.3305384818357737
results.17 4000 20
Evaluated on 200 samples
average improvement 1.9227698931116248 stdev 1.3147693737941966
results.6 4000 20
Evaluated on 200 samples
average improvement 1.90332296849364 stdev 1.2705125160601134
results.12 4000 20
Evaluated on 200 samples
average improvement 1.92016980908509 stdev 1.3062149291391378
results.1 4000 20
Evaluated on 200 sample

In [70]:
get_diversity([os.path.join(cwd,'logp06')],num_decode=20, sim_delta=0.6, prop_delta=0.0)

0.25168514941377257 0.1832931137364872
123
0.27101918762625316 0.15427417958912037
136
0.27257415795461853 0.1813478345990083
124
0.2642336996597887 0.17843668290635384
123
0.247601106362365 0.19131773403323013
119
0.23624457192800016 0.18719627719520993
110
0.24499127255505102 0.1891386736374487
114
0.26018179107664885 0.17695667705194196
132
0.2689487809357834 0.17652166871506114
124
0.1892903252704556 0.1900743569686082
69
0.26892980435681846 0.18594994810277984
121
0.2476184536923631 0.19340915800702174
109
0.2803635589059849 0.17337712544190054
130
0.11186428969390562 0.16985398004496652
38
0.2666400044882213 0.19155890785828147
121
0.2523441397118569 0.18455541353629798
116
0.274245486801954 0.18590115910901384
122


# qed scores

In [71]:
get_qed_results([os.path.join(cwd,'qed')])

Evaluated on 360 samples
success rate 0.5777777777777777
Evaluated on 360 samples
success rate 0.5027777777777778
Evaluated on 360 samples
success rate 0.6083333333333333
Evaluated on 360 samples
success rate 0.5277777777777778
Evaluated on 360 samples
success rate 0.5111111111111111
Evaluated on 360 samples
success rate 0.5
Evaluated on 360 samples
success rate 0.5944444444444444
Evaluated on 360 samples
success rate 0.5444444444444444
Evaluated on 360 samples
success rate 0.6027777777777777
Evaluated on 360 samples
success rate 0.5416666666666666
Evaluated on 360 samples
success rate 0.5611111111111111
Evaluated on 360 samples
success rate 0.5194444444444445
Evaluated on 360 samples
success rate 0.5444444444444444
Evaluated on 360 samples
success rate 0.5055555555555555
Evaluated on 360 samples
success rate 0.55
Evaluated on 360 samples
success rate 0.55
Evaluated on 360 samples
success rate 0.4027777777777778
Evaluated on 360 samples
success rate 0.5305555555555556
Evaluated on 360 

In [72]:
get_diversity([os.path.join(cwd,'qed')],num_decode=20, sim_delta=0.4, prop_delta=0.9)

0.3693699166145171 0.2382925687386741
157
0.351322140155191 0.25713371668394164
125
0.3491488715107504 0.2517103234764283
149
0.3409880788558808 0.25364820918672276
128
0.35425021182387384 0.26426955862821927
124
0.3355919109573457 0.2599883023649115
120
0.36236626787638865 0.2439540246731356
157
0.36004178251310565 0.24642883568083546
139
0.36242311437885805 0.24074318374882428
157
0.2958011709828717 0.2713155056876681
112
0.3694900331004066 0.24301047873411238
148
0.3596168557787925 0.2474988555932498
133
0.3373504316736096 0.22678559255622743
142
0.3505161679188637 0.25705850861973834
125
0.3838622223974441 0.2441219628772513
149
0.3264114610997781 0.2552285153322981
129
0.27804366609220826 0.24038095641567836
90
0.34026608074206904 0.2646004933741437
125
0.3522970216990097 0.24528965432003896
145
0.3102846887717595 0.25705091202129454
124


## drd2 scores

In [64]:
get_drd2_results([os.path.join(cwd,'drd2')])

Evaluated on 500 samples
success rate 0.492
Evaluated on 500 samples
success rate 0.73
Evaluated on 500 samples
success rate 0.608
Evaluated on 500 samples
success rate 0.732
Evaluated on 500 samples
success rate 0.746
Evaluated on 500 samples
success rate 0.734
Evaluated on 500 samples
success rate 0.61
Evaluated on 500 samples
success rate 0.706
Evaluated on 500 samples
success rate 0.542
Evaluated on 500 samples
success rate 0.728
Evaluated on 500 samples
success rate 0.67
Evaluated on 500 samples
success rate 0.732
Evaluated on 500 samples
success rate 0.356
Evaluated on 500 samples
success rate 0.74
Evaluated on 500 samples
success rate 0.712
Evaluated on 500 samples
success rate 0.708
Evaluated on 500 samples
success rate 0.22
Evaluated on 500 samples
success rate 0.728
Evaluated on 500 samples
success rate 0.698
Evaluated on 500 samples
success rate 0.73


In [73]:
get_diversity([os.path.join(cwd,'drd2')],num_decode=20, sim_delta=0.4, prop_delta=0.5)

0.10578605910744862 0.19140946512740686
66
0.15936814158425788 0.23268096779203318
133
0.1553827314942292 0.22639243694876884
108
0.15050009807190248 0.22982144308058497
125
0.1496066199318858 0.22453694969196636
132
0.14605648116497025 0.22409086355879743
124
0.12775113373238503 0.22062235104185604
85
0.14704756105710304 0.22573055813828788
121
0.09828667057483881 0.19736037530550254
61
0.15468101117193708 0.23033962534989008
126
0.1293169036618492 0.21449004113805553
101
0.1550966514440852 0.23581943514494685
126
0.09035217974420905 0.17744633792166875
41
0.1498009035346191 0.22628817372976032
128
0.14254383044867264 0.22464629268170916
114
0.1366472309175545 0.22045623021648664
112
0.07349659498104709 0.14233083266808497
26
0.15135442184269363 0.23084911914585127
126
0.1387404811566794 0.22216687860214024
111
0.14911801806981778 0.2316284107985881
121
