In [1]:
import os
import sys
import tqdm
import pandas as pd

In [2]:
# The repository root is taken to be the parent of the current working
# directory; it is put at the front of the module search path (only if it is
# not already listed) so the project package can be imported from here.
ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if ROOT_PATH not in sys.path:
    sys.path = [ROOT_PATH] + sys.path

In [3]:
from COMA.properties import similarity

In [4]:
# Name of the property under analysis; it doubles as the data directory name
# and as the label used in the printed summaries below.
PROPERTY_NAME = "abcg2"

# 1. Read data

In [5]:
# Every dataset file sits in the directory named after the property:
# training triplets, training pairs, validation set and test set (in that order).
filepath_tr_trip, filepath_tr_pair, filepath_va, filepath_te = (
    os.path.join(PROPERTY_NAME, filename)
    for filename in (
        'rdkit_train_triplet.txt',
        'rdkit_train_pairs_full.csv',
        'rdkit_valid_full.csv',
        'rdkit_test_full.csv',
    )
)

In [6]:
# The triplet file is single-space separated and has no header row, so its
# columns are labelled 0, 1, 2. The remaining files are read with the pandas
# CSV defaults.
df_tr_trip = pd.read_csv(filepath_tr_trip, sep=' ', header=None)
df_tr_pair, df_va, df_te = map(pd.read_csv, (filepath_tr_pair, filepath_va, filepath_te))

In [7]:
# Distinct (SMILES, Q9UNQ0 value) rows for the source side and for the target
# side of the training pairs.
df_tr_src = df_tr_pair[['SMILES_src', 'Q9UNQ0_src']].drop_duplicates()
df_tr_tar = df_tr_pair[['SMILES_tar', 'Q9UNQ0_tar']].drop_duplicates()

In [8]:
# Report the shape of each training table, one per line. The last line reports
# the shape of the de-duplicated third triplet column (the negatives).
print(
    f"Number of Triplets  : {df_tr_trip.shape}",
    f"Number of Pairs     : {df_tr_pair.shape}",
    f"Number of Sources   : {df_tr_src.shape}",
    f"Number of Targets   : {df_tr_tar.shape}",
    f"Number of Negatives : {df_tr_trip.iloc[:,2].drop_duplicates().shape}",
    sep="\n",
)

Number of Triplets  : (4612380, 3)
Number of Pairs     : (230619, 7)
Number of Sources   : (13840, 2)
Number of Targets   : (2340, 2)
Number of Negatives : (16180,)


# 2. Property
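- Q9UNQ0 : ABCG2 (Q9UNQ0 is the UniProt accession for ABCG2; it prefixes the property columns `Q9UNQ0_src` / `Q9UNQ0_tar`)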

In [9]:
# Minimum and maximum of the property column (second column) for the unique
# source molecules and for the unique target molecules.
print(
    f"Range of {PROPERTY_NAME} (src): {df_tr_src.iloc[:,1].min():.2f} ~ {df_tr_src.iloc[:,1].max():.2f}",
    f"Range of {PROPERTY_NAME} (tar): {df_tr_tar.iloc[:,1].min():.2f} ~ {df_tr_tar.iloc[:,1].max():.2f}",
    sep="\n",
)

Range of abcg2 (src): 4.90 ~ 8.37
Range of abcg2 (tar): 3.40 ~ 4.70


In [10]:
# Mean of the property column, with the standard deviation in parentheses,
# for the unique source molecules and for the unique target molecules.
print(
    f"Average of {PROPERTY_NAME} (src): {df_tr_src.iloc[:,1].mean():.2f} ({df_tr_src.iloc[:,1].std():.2f})",
    f"Average of {PROPERTY_NAME} (tar): {df_tr_tar.iloc[:,1].mean():.2f} ({df_tr_tar.iloc[:,1].std():.2f})",
    sep="\n",
)

Average of abcg2 (src): 5.63 (0.57)
Average of abcg2 (tar): 4.34 (0.22)


# 3. Similarity & Improvement

In [11]:
# Display the training-pair table as the cell output (the rendering of a long
# frame is truncated to its first and last rows).
df_tr_pair

Unnamed: 0,SMILES_src,SMILES_tar,P15056_src,P15056_tar,Q9UNQ0_src,Q9UNQ0_tar,TANIMOTO
0,CC1=CC=C(C(=O)NC2=CC=C(Br)C=C2)C=C1S(=O)(=O)NC...,O=C(NC1=CC=C2C=CC=NC2=C1)C1=CC=C(Br)C=C1,6.891558,6.526552,5.380080,4.581529,0.440678
1,CC1=CC=C(C(=O)NC2=CC=CC(C(F)(F)F)=C2)C=C1NC(=O...,O=C(NC1=CC=CC(C(F)(F)F)=C1)C1=CC=CC(NC(=O)C2=C...,6.346293,6.541248,5.036449,4.431240,0.455882
2,O=C(NC1=CC=C(Br)C=C1)C1=CC=C(C(F)(F)P(=O)(O)O)...,O=C(NC1=CC=C(Cl)C=C1)C1=CC=CC=C1,6.936429,6.513035,6.975090,3.876176,0.414634
3,O=C(NC1=CC=CC=C1)C1=CC=C(C(F)(F)P(=O)(O)O)C=C1,O=C(NC1=CC=CC=C1)C1=CC=CC=N1,6.985357,6.564551,6.571701,4.124291,0.410256
4,CC1=CC=C(NC(=O)C2=CC=CC(C(F)(F)F)=C2)C=C1C(=O)...,O=C(NC1=CC=CC(OC2CCCN(C(=O)C3=CC=CN=C3)C2)=C1)...,6.323243,6.250835,5.237891,4.411502,0.412500
...,...,...,...,...,...,...,...
230614,CCC1=CC=C(NC(=O)NC2=CC=C3C=CC=CC3=C2)C=C1,O=C(NC1=CC=C2C=CC=CC2=C1)C1=CC=CC=C1SSC1=CC=CC...,6.213083,6.280292,4.956615,4.374473,0.454545
230615,O=C(NC1=CC=C(OC(F)(F)Cl)C=C1)C1=CN=C(N2CC(O)C2...,NC1CN(C2=CC=C(C(=O)NC3=CC=C(OC(F)(F)Cl)C=C3)C=...,6.811960,6.630232,5.450017,4.665955,0.640000
230616,O=C(NCC1=CC=CC=C1)NC1=CC=C2C=NC=CC2=C1,COC1=CC=C(NC(=O)NCC2=CC=CN=C2)C(OC)=C1,6.998187,6.272488,5.178164,4.333024,0.431034
230617,O=C(NC1=CC(O)=NC(O)=N1)C1=CC=CC(C(F)(F)F)=C1,COC(=O)C1=CC=CC(NC(=O)C2=CC=CC(C(F)(F)F)=C2)=C1N,6.241566,6.441089,4.939745,4.039503,0.454545


In [12]:
# Work on a copy so the loaded pair table stays untouched, then add
# IMPROVEMENT = target property value minus source property value.
df_tr_pair_ = df_tr_pair.copy()
df_tr_pair_['IMPROVEMENT'] = df_tr_pair_['Q9UNQ0_tar'].sub(df_tr_pair_['Q9UNQ0_src'])

In [13]:
# Minimum and maximum of the pairwise Tanimoto similarity and of the
# improvement (target minus source) over all training pairs.
print(
    f"Range of Similarity (src,tar): {df_tr_pair_.loc[:,'TANIMOTO'].min():.2f} ~ {df_tr_pair_.loc[:,'TANIMOTO'].max():.2f}",
    f"Range of Improvement (tar-src): {df_tr_pair_.loc[:,'IMPROVEMENT'].min():.2f} ~ {df_tr_pair_.loc[:,'IMPROVEMENT'].max():.2f}",
    sep="\n",
)

Range of Similarity (src,tar): 0.40 ~ 1.00
Range of Improvement (tar-src): -4.85 ~ -0.20


In [14]:
# Mean, with the standard deviation in parentheses, of the pairwise Tanimoto
# similarity and of the improvement (target minus source).
print(
    f"Average of Similarity (src,tar) : {df_tr_pair_.loc[:,'TANIMOTO'].mean():.2f} ({df_tr_pair_.loc[:,'TANIMOTO'].std():.2f})",
    f"Average of Improvement (tar-src): {df_tr_pair_.loc[:,'IMPROVEMENT'].mean():.2f} ({df_tr_pair_.loc[:,'IMPROVEMENT'].std():.2f})",
    sep="\n",
)

Average of Similarity (src,tar) : 0.46 (0.06)
Average of Improvement (tar-src): -1.15 (0.56)


In [15]:
# For every (source, target, negative) triplet, compute the similarity of the
# negative to the source (stored in column 3) and to the target (column 4).
df_tr_trip_ = df_tr_trip.copy()

# Results are collected in plain lists and attached as whole columns at the
# end. Writing one scalar at a time with `.iloc[i, j] = ...` goes through
# pandas' indexing machinery on every row, which is needlessly expensive for
# a table with millions of rows; iterating the columns with zip() likewise
# avoids three scalar `.iloc` reads per row.
sims_src_neg = []
sims_tar_neg = []
triplets = zip(df_tr_trip_.iloc[:, 0], df_tr_trip_.iloc[:, 1], df_tr_trip_.iloc[:, 2])
for smi_src, smi_tar, smi_neg in tqdm.tqdm(triplets, total=len(df_tr_trip_)):
    sims_src_neg.append(similarity(smi_src, smi_neg))
    sims_tar_neg.append(similarity(smi_tar, smi_neg))

# Force a float dtype (as the original zero-initialised columns had) and assign
# raw arrays so no index alignment is involved; this also handles an empty table.
df_tr_trip_[3] = pd.Series(sims_src_neg, dtype=float).to_numpy()
df_tr_trip_[4] = pd.Series(sims_tar_neg, dtype=float).to_numpy()

100%|██████████| 4612380/4612380 [26:22:01<00:00, 48.59it/s]   


In [16]:
# Minimum and maximum similarity of the negatives to the source molecules
# (column 3) and to the target molecules (column 4).
print(
    f"Range of Similarity (src,neg): {df_tr_trip_.iloc[:,3].min():.2f} ~ {df_tr_trip_.iloc[:,3].max():.2f}",
    f"Range of Similarity (tar,neg): {df_tr_trip_.iloc[:,4].min():.2f} ~ {df_tr_trip_.iloc[:,4].max():.2f}",
    sep="\n",
)

Range of Similarity (src,neg): 0.04 ~ 0.30
Range of Similarity (tar,neg): 0.04 ~ 0.30
