In [1]:
import os
import sys
import tqdm
import pandas as pd

In [2]:
# Parent of the current working directory; prepended to sys.path below so that
# modules located there can be imported by later cells.
MTMR_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Only prepend when the entry is missing, so re-running the cell never adds duplicates.
if MTMR_PATH not in sys.path:
    sys.path = [MTMR_PATH] + sys.path

In [3]:
from MTMR.properties import drd2, qed, penalized_logp, similarity

In [4]:
# Name of the dataset directory the split files are read from; also used as the
# label in the printed property statistics.
PROPERTY_NAME = 'logp04'
# Scoring function applied to each SMILES string when computing property values.
# NOTE(review): must match the property the PROPERTY_NAME dataset was built for.
SCORING_FT = penalized_logp

# 1. Read data

In [5]:
def _split_path(filename):
    """Return the path of `filename` inside the PROPERTY_NAME directory."""
    return os.path.join(PROPERTY_NAME, filename)


# Training files: triplets, pairs, and the source/target lists.
filepath_tr_trip = _split_path('rdkit_train_triplet.txt')
filepath_tr_pair = _split_path('rdkit_train_pairs.txt')
filepath_tr_src = _split_path('rdkit_train_src.txt')
filepath_tr_tar = _split_path('rdkit_train_tar.txt')
# Validation and test files.
filepath_va = _split_path('rdkit_valid.txt')
filepath_te = _split_path('rdkit_test.txt')

In [6]:
# Triplet and pair files: header-less, columns separated by a single space.
df_tr_trip, df_tr_pair = (
    pd.read_csv(path, header=None, sep=' ')
    for path in (filepath_tr_trip, filepath_tr_pair)
)

# Source and target lists: header-less; repeated rows are dropped and the index
# is renumbered from 0.
df_tr_src, df_tr_tar = (
    pd.read_csv(path, header=None).drop_duplicates(ignore_index=True)
    for path in (filepath_tr_src, filepath_tr_tar)
)

# Validation and test files are loaded as-is (header-less).
df_va, df_te = (
    pd.read_csv(path, header=None)
    for path in (filepath_va, filepath_te)
)

In [7]:
# Shape of every training table; "Negatives" counts the distinct values in the
# third column of the triplet table.
shape_report = (
    ("Triplets", df_tr_trip.shape),
    ("Pairs", df_tr_pair.shape),
    ("Sources", df_tr_src.shape),
    ("Targets", df_tr_tar.shape),
    ("Negatives", df_tr_trip.iloc[:, 2].drop_duplicates().shape),
)
for label, shape in shape_report:
    # Labels are left-padded to 10 characters so the colons line up.
    print(f"Number of {label:<10}: {shape}")

Number of Triplets  : (1973800, 3)
Number of Pairs     : (98690, 2)
Number of Sources   : (57856, 1)
Number of Targets   : (44759, 1)
Number of Negatives : (99066,)


# 2. Property

In [8]:
# Work on a copy so the loaded source table stays untouched.
df_tr_src_ = df_tr_src.copy()

# Score every source molecule (column 0) with SCORING_FT and attach the result
# as column 1. The scores are gathered in a list and assigned as one float
# column: writing each value through `.iloc[i, 1]` inside the loop pays pandas'
# scalar-setitem overhead on every row for the same end result.
src_scores = [SCORING_FT(smi) for smi in tqdm.tqdm(df_tr_src_.iloc[:, 0])]
df_tr_src_[1] = pd.Series(src_scores, index=df_tr_src_.index, dtype=float)

100%|██████████| 57856/57856 [00:57<00:00, 1013.24it/s]


In [9]:
# Work on a copy so the loaded target table stays untouched.
df_tr_tar_ = df_tr_tar.copy()

# Score every target molecule (column 0) with SCORING_FT and attach the result
# as column 1. The scores are gathered in a list and assigned as one float
# column: writing each value through `.iloc[i, 1]` inside the loop pays pandas'
# scalar-setitem overhead on every row for the same end result.
tar_scores = [SCORING_FT(smi) for smi in tqdm.tqdm(df_tr_tar_.iloc[:, 0])]
df_tr_tar_[1] = pd.Series(tar_scores, index=df_tr_tar_.index, dtype=float)

100%|██████████| 44759/44759 [00:45<00:00, 985.71it/s] 


In [10]:
# Min/max of the computed property (column 1) for the source and target tables.
for split, scores in (("src", df_tr_src_.iloc[:, 1]), ("tar", df_tr_tar_.iloc[:, 1])):
    print(f"Range of {PROPERTY_NAME} ({split}): {scores.min():.2f} ~ {scores.max():.2f}")

Range of logp04 (src): -62.52 ~ 1.66
Range of logp04 (tar): -42.76 ~ 4.17


In [11]:
# Mean (standard deviation) of the computed property (column 1) for the source
# and target tables.
for split, scores in (("src", df_tr_src_.iloc[:, 1]), ("tar", df_tr_tar_.iloc[:, 1])):
    print(f"Average of {PROPERTY_NAME} ({split}): {scores.mean():.2f} ({scores.std():.2f})")

Average of logp04 (src): -2.02 (2.05)
Average of logp04 (tar): 1.22 (1.48)


# 3. Similarity & Improvement

In [12]:
# Work on a copy so the loaded pair table stays untouched.
df_tr_pair_ = df_tr_pair.copy()

# For every (source, target) pair compute
#   column 2: similarity(source, target)
#   column 3: property improvement, SCORING_FT(target) - SCORING_FT(source)
# Results are accumulated in plain lists and written back as whole columns once
# the loop is done; per-row `.iloc[i, k] = value` writes pay pandas'
# scalar-setitem overhead twice per pair for the same end result.
pair_sims = []
pair_improvements = []
pair_iter = zip(df_tr_pair_.iloc[:, 0], df_tr_pair_.iloc[:, 1])
for smi_src, smi_tar in tqdm.tqdm(pair_iter, total=len(df_tr_pair_)):
    ## similarity
    pair_sims.append(similarity(smi_src, smi_tar))
    ## improvement
    prop_src = SCORING_FT(smi_src)
    prop_tar = SCORING_FT(smi_tar)
    pair_improvements.append(prop_tar - prop_src)

df_tr_pair_[2] = pd.Series(pair_sims, index=df_tr_pair_.index, dtype=float)
df_tr_pair_[3] = pd.Series(pair_improvements, index=df_tr_pair_.index, dtype=float)

100%|██████████| 98690/98690 [03:40<00:00, 448.39it/s]


In [13]:
# Min/max of the pair similarity (column 2) and improvement (column 3).
for label, col in (("Similarity (src,tar)", 2), ("Improvement (tar-src)", 3)):
    values = df_tr_pair_.iloc[:, col]
    # Labels are left-padded to 21 characters so the colons line up.
    print(f"Range of {label:<21}: {values.min():.2f} ~ {values.max():.2f}")

Range of Similarity (src,tar) : 0.40 ~ 1.00
Range of Improvement (tar-src): 1.00 ~ 64.36


In [14]:
# Mean (standard deviation) of the pair similarity (column 2) and improvement (column 3).
for label, col in (("Similarity (src,tar)", 2), ("Improvement (tar-src)", 3)):
    values = df_tr_pair_.iloc[:, col]
    # Labels are left-padded to 21 characters so the colons line up.
    print(f"Average of {label:<21}: {values.mean():.2f} ({values.std():.2f})")

Average of Similarity (src,tar) : 0.43 (0.04)
Average of Improvement (tar-src): 3.77 (1.47)


In [15]:
# Work on a copy so the loaded triplet table stays untouched.
df_tr_trip_ = df_tr_trip.copy()

# For every (source, target, negative) triplet compute
#   column 3: similarity(source, negative)
#   column 4: similarity(target, negative)
# Results are accumulated in plain lists and written back as whole columns once
# the loop is done. The previous per-row `.iloc[i, k] = value` writes added
# pandas' scalar-setitem overhead twice per row, which is significant on a
# table with roughly two million rows.
sims_src_neg = []
sims_tar_neg = []
triplet_iter = zip(
    df_tr_trip_.iloc[:, 0],
    df_tr_trip_.iloc[:, 1],
    df_tr_trip_.iloc[:, 2],
)
for smi_src, smi_tar, smi_neg in tqdm.tqdm(triplet_iter, total=len(df_tr_trip_)):
    ## similarity
    sims_src_neg.append(similarity(smi_src, smi_neg))
    sims_tar_neg.append(similarity(smi_tar, smi_neg))

df_tr_trip_[3] = pd.Series(sims_src_neg, index=df_tr_trip_.index, dtype=float)
df_tr_trip_[4] = pd.Series(sims_tar_neg, index=df_tr_trip_.index, dtype=float)

100%|██████████| 1973800/1973800 [6:14:13<00:00, 87.90it/s]   


In [16]:
# Min/max similarity of the negative to the source (column 3) and to the target (column 4).
for anchor, col in (("src", 3), ("tar", 4)):
    values = df_tr_trip_.iloc[:, col]
    print(f"Range of Similarity ({anchor},neg): {values.min():.2f} ~ {values.max():.2f}")

Range of Similarity (src,neg): 0.00 ~ 0.30
Range of Similarity (tar,neg): 0.00 ~ 0.30
