# Data cleansing: tid-20174

In [1]:
import pytest
import os,sys
import itertools
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.Scaffolds import MurckoScaffold
#sys.path.append(os.path.join(os.path.dirname(__file__), '../src/'))
sys.path.append('../')
from sphg.infra.xmol import xmol
from sphg.common.graph import Graph
from sphg.common.unique import unique
from sphg.common.scaffold import Scaffold
from sphg.common import pickle0
#from infra.phcg import LPhcg
#from database.ligand import Ligand
import sphg.common.molwt as molwt

| TID    | CHEMBLID   | Name                            | Uniprot |
| ------ | ---------- | ------------------------------- | ------- |
| 8      | CHEMBL1862 | Tyrosine kinase ABL1            | P00519  |
| 11     | CHEMBL204  | Thrombin                        | P00734  |
| 137    | CHEMBL237  | κ-opioid receptor               | P41145  |
| 11362  | CHEMBL4005 | PI3-kinase p110-alpha subunit   | N/A     |
| 20174  | N/A        | G protein-coupled receptor 44   | N/A     |
| 104219 | N/A        | Transmembrane protease serine 6 | N/A     |

In [2]:
# Target ID whose compound tables are cleaned in this notebook.
tid=20174
# Directory with the per-target exports. It is kept as a plain string with a
# trailing slash because later cells build file paths by string concatenation.
tsvdir='../../data/sphg/tid-%d/'%tid
# os.listdir() returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run and every filesystem.
files=sorted(os.listdir(tsvdir))
files

['tid-20174-actives.txt', 'tid-20174-inactives.txt']

In [3]:
# Load the tab-separated active and inactive compound tables for this target.
dfactives = pd.read_csv(tsvdir+'tid-%d-actives.txt'%tid, sep='\t')
dfinactives = pd.read_csv(tsvdir+'tid-%d-inactives.txt'%tid, sep='\t')

In [4]:
# Preview the raw actives table as loaded from the TSV file.
dfactives

Unnamed: 0,funatsu_lab_id,chembl-id,"pot.(log,Ki)","pot.(nMol,Ki)",aromatic_smiles,non_stereo_aromatic_smiles,all-chembl-ids,no.-meas.
0,1556435,CHEMBL3639847,10.00,0.1,Cc1c2cc(ccc2[nH]c1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)...,Cc1c2cc(ccc2[nH]c1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)...,CHEMBL3639847,2
1,1560497,CHEMBL3690164,10.00,0.1,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C...,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C...,CHEMBL3690164,2
2,1560498,CHEMBL3690165,10.00,0.1,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C)...,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C)...,CHEMBL3690165,2
3,1560510,CHEMBL3690177,10.00,0.1,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n4...,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n4...,CHEMBL3690177,2
4,1560511,CHEMBL3690178,10.00,0.1,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n...,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n...,CHEMBL3690178,2
...,...,...,...,...,...,...,...,...
855,259538,CHEMBL180732,4.77,17000.0,c1ccc(cc1)CNc2ccc3c(c2)c4c(n3CCC(=O)O)CCCC4,c1ccc(cc1)CNc2ccc3c(c2)c4c(n3CCC(=O)O)CCCC4,CHEMBL180732,1
856,259522,CHEMBL360147,4.74,18000.0,CCC(C(=O)O)n1c2ccc(cc2c3c1CCCC3)NS(=O)(=O)c4cc...,CCC(C(=O)O)n1c2ccc(cc2c3c1CCCC3)NS(=O)(=O)c4cc...,CHEMBL360147,1
857,259602,CHEMBL426240,4.72,19000.0,c1cc(ccc1F)S(=O)(=O)Nc2ccc3c(c2)c4c(n3/C=C\C(=...,c1cc(ccc1F)S(=O)(=O)Nc2ccc3c(c2)c4c(n3C=CC(=O)...,CHEMBL426240,1
858,259450,CHEMBL180098,4.57,27000.0,c1cc(cc(c1)n2c3ccc(cc3c4c2CCCC4)NS(=O)(=O)c5cc...,c1cc(cc(c1)n2c3ccc(cc3c4c2CCCC4)NS(=O)(=O)c5cc...,CHEMBL180098,1


In [5]:
# Preview the raw inactives table as loaded from the TSV file.
dfinactives

Unnamed: 0,funatsu_lab_id,chembl-id,potencies,units,aromatic_smiles,non_stereo_aromatic_smiles,all-chembl-ids,activity-comment
0,308524,CHEMBL379639,"5200.0,None,None","IC50,Activity,Activity",Cc1c(c2ccccc2n1c3ccncn3)CC(=O)O,Cc1c(c2ccccc2n1c3ccncn3)CC(=O)O,CHEMBL379639,"[None, 'Partial agonist', 'Not Active']"
1,308464,CHEMBL378666,"21.0,None","IC50,Activity",Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4,Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4,CHEMBL378666,"[None, 'Not Active']"
2,308430,CHEMBL378094,"125.0,None","IC50,Activity",Cc1ccc2c(c1)n(c(c2c3ccnc4c3ccc(c4)Cl)C)CC(=O)O,Cc1ccc2c(c1)n(c(c2c3ccnc4c3ccc(c4)Cl)C)CC(=O)O,CHEMBL378094,"[None, 'Not Active']"
3,1246815,CHEMBL2385123,,IC50,Cc1c(c(nn1Cc2ccc(cc2)S(=O)(=O)C)c3ccccc3)CC(=O)O,Cc1c(c(nn1Cc2ccc(cc2)S(=O)(=O)C)c3ccccc3)CC(=O)O,CHEMBL2385123,['Not Active']
4,1246584,CHEMBL2385111,,IC50,Cc1c(c(nn1CC2CCCN2S(=O)(=O)c3ccccc3)c4ccccc4)C...,Cc1c(c(nn1CC2CCCN2S(=O)(=O)c3ccccc3)c4ccccc4)C...,CHEMBL2385111,['Not Active']
5,308280,CHEMBL378730,"2.3,None","IC50,Activity",Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4Cl,Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4Cl,CHEMBL378730,"[None, 'Not Active']"
6,308334,CHEMBL213495,"18.0,None","IC50,Activity",Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4C#N,Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4C#N,CHEMBL213495,"[None, 'Not Active']"
7,1283736,CHEMBL3099128,,IC50,Cc1c(c(nn1CC(=O)O)OC)Cc2ccccc2S(=O)(=O)c3ccccc3,Cc1c(c(nn1CC(=O)O)OC)Cc2ccccc2S(=O)(=O)c3ccccc3,CHEMBL3099128,['Not Active']
8,1381409,CHEMBL3338264,,IC50,Cc1ccc2c(cn(c2n1)c3ccc(cc3Cc4cc(nn4C)C)Cl)CC(=O)O,Cc1ccc2c(cn(c2n1)c3ccc(cc3Cc4cc(nn4C)C)Cl)CC(=O)O,CHEMBL3338264,['Not Active']
9,1381410,CHEMBL3338265,,IC50,Cc1ccc2c(cn(c2n1)c3ccccc3Cn4cc(nc4C)C)CC(=O)O,Cc1ccc2c(cn(c2n1)c3ccccc3Cn4cc(nc4C)C)CC(=O)O,CHEMBL3338265,['Not Active']


In [6]:
# Plain Python lists of the actives' stereo-free SMILES and log-scale Ki
# potencies, indexed by row position for the molecule viewer.
smileslist = dfactives['non_stereo_aromatic_smiles'].tolist()
activitiylist = dfactives['pot.(log,Ki)'].tolist()

In [7]:

# Number of active compounds; showMol reports it in its status line.
nmol = len(smileslist)
def showMol(index):
    """Draw the active compound at position ``index`` and print its details.

    Reads the notebook-level ``smileslist``, ``activitiylist`` and ``nmol``.
    The SMILES is parsed with RDKit, wrapped in the project's ``xmol`` class,
    tagged with its activity, displayed, and followed by a summary line, the
    SMILES string, and the wrapper's ``featsAtomListGraph`` and
    ``featsFamilyList`` attributes.

    Args:
        index: Zero-based position of the compound in ``smileslist``.
    """
    smi = smileslist[index]
    wrapped = xmol(Chem.MolFromSmiles(smi), molid=0)
    wrapped.activity = activitiylist[index]
    # Presumably populates the feature attributes printed below -- confirm in xmol.
    wrapped.getChemicalFeats()
    wrapped.show(grid=False, num=False)
    print('### calc. %d-th/%d activity=%6.2f' % (index, nmol, wrapped.activity))
    print('SMILES = %s' % smi)
    print(wrapped.featsAtomListGraph)
    print(wrapped.featsFamilyList)

In [9]:
from ipywidgets import interact

# Integer slider over every valid row position; a two-integer tuple yields an
# IntSlider with the default step of 1.
interact(showMol, index=(0, nmol - 1))

interactive(children=(IntSlider(value=429, description='index', max=859), Output()), _dom_classes=('widget-int…

<function __main__.showMol(index)>

In [10]:
# Reduce both tables to a common (chembl-id, activity, smiles) schema.
# Each frame is built as a new object through rename/assign, so nothing is
# assigned into a filtered slice (the pattern behind SettingWithCopyWarning).
dfactives1 = (
    dfactives[['chembl-id', 'pot.(log,Ki)', 'non_stereo_aromatic_smiles']]
    .rename(columns={'pot.(log,Ki)': 'activity',
                     'non_stereo_aromatic_smiles': 'smiles'})
)

# Keep only inactives whose potency field is exactly the string 'None', then
# give them a nominal activity of 0.0.
# NOTE(review): this filter relies on read_csv leaving 'None' as a string; some
# pandas versions may parse it as NaN, which would empty this frame -- confirm.
dfinactives1 = (
    dfinactives[['chembl-id', 'potencies', 'non_stereo_aromatic_smiles']]
    .rename(columns={'potencies': 'activity',
                     'non_stereo_aromatic_smiles': 'smiles'})
    .loc[lambda frame: frame['activity'] == 'None']
    .assign(activity=0.0)
)
dfinactives1

Unnamed: 0,chembl-id,activity,smiles
3,CHEMBL2385123,0.0,Cc1c(c(nn1Cc2ccc(cc2)S(=O)(=O)C)c3ccccc3)CC(=O)O
4,CHEMBL2385111,0.0,Cc1c(c(nn1CC2CCCN2S(=O)(=O)c3ccccc3)c4ccccc4)C...
7,CHEMBL3099128,0.0,Cc1c(c(nn1CC(=O)O)OC)Cc2ccccc2S(=O)(=O)c3ccccc3
8,CHEMBL3338264,0.0,Cc1ccc2c(cn(c2n1)c3ccc(cc3Cc4cc(nn4C)C)Cl)CC(=O)O
9,CHEMBL3338265,0.0,Cc1ccc2c(cn(c2n1)c3ccccc3Cn4cc(nc4C)C)CC(=O)O
11,CHEMBL3338266,0.0,Cc1ccc2c(cn(c2n1)c3ccccc3Cn4c(nc(n4)C)C)CC(=O)O
12,CHEMBL3338262,0.0,COc1ccc(cc1c2ccc(cc2Cn3ccccc3=O)C(F)(F)F)CC(=O)O
14,CHEMBL2385119,0.0,Cc1c(c(nn1C(c2ccc(cc2)F)c3ccc(cc3)F)c4ccc(nc4)...
18,CHEMBL3099125,0.0,Cc1c(c(nn1CC(=O)O)c2ccccc2)Cc3ccccc3c4cccc5c4c...
20,CHEMBL3099100,0.0,Cc1c(c(nn1CC(=O)O)c2ccc(cc2)C(=O)N(C)C)Cc3cccc...


In [11]:
# Stack actives on top of the retained inactives; original row labels are kept.
dfall = pd.concat([dfactives1, dfinactives1], axis=0)
dfall

Unnamed: 0,chembl-id,activity,smiles
0,CHEMBL3639847,10.0,Cc1c2cc(ccc2[nH]c1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)...
1,CHEMBL3690164,10.0,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C...
2,CHEMBL3690165,10.0,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C)...
3,CHEMBL3690177,10.0,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n4...
4,CHEMBL3690178,10.0,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n...
...,...,...,...
18,CHEMBL3099125,0.0,Cc1c(c(nn1CC(=O)O)c2ccccc2)Cc3ccccc3c4cccc5c4c...
20,CHEMBL3099100,0.0,Cc1c(c(nn1CC(=O)O)c2ccc(cc2)C(=O)N(C)C)Cc3cccc...
21,CHEMBL1689116,0.0,c1ccc(cc1)S(=O)(=O)C2CCN(CC2)Cc3cc(ccc3OCC(=O)...
25,CHEMBL3099127,0.0,Cc1c(c(=O)[nH]n1CC(=O)O)Cc2ccccc2S(=O)(=O)c3cc...


In [12]:
# Delete duplicated compounds.
# The combined table is rebuilt here so the cell can be re-run on its own.
dfall = pd.concat([dfactives1, dfinactives1])
print("Before deletion of duplicated compounds: No. of cmpounds is %d"%len(dfall))
# keep=False discards every member of a duplicate group (no survivor is kept),
# first by ChEMBL id and then by SMILES string.
dfall = (
    dfall
    .drop_duplicates(subset='chembl-id', keep=False)
    .drop_duplicates(subset='smiles', keep=False)
)
print("After deletion of duplicated compounds: No. of cmpounds is %d"%len(dfall))

dfall

Before deletion of duplicated compounds: No. of cmpounds is 873
After deletion of duplicated compounds: No. of cmpounds is 823


Unnamed: 0,chembl-id,activity,smiles
0,CHEMBL3639847,10.0,Cc1c2cc(ccc2[nH]c1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)...
1,CHEMBL3690164,10.0,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C...
2,CHEMBL3690165,10.0,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C)...
3,CHEMBL3690177,10.0,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n4...
4,CHEMBL3690178,10.0,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n...
...,...,...,...
18,CHEMBL3099125,0.0,Cc1c(c(nn1CC(=O)O)c2ccccc2)Cc3ccccc3c4cccc5c4c...
20,CHEMBL3099100,0.0,Cc1c(c(nn1CC(=O)O)c2ccc(cc2)C(=O)N(C)C)Cc3cccc...
21,CHEMBL1689116,0.0,c1ccc(cc1)S(=O)(=O)C2CCN(CC2)Cc3cc(ccc3OCC(=O)...
25,CHEMBL3099127,0.0,Cc1c(c(=O)[nH]n1CC(=O)O)Cc2ccccc2S(=O)(=O)c3cc...


In [13]:
# Write the de-duplicated table next to the input files as the master CSV
# for this target; the row index is not written.
dfout = dfall
dfout.to_csv(tsvdir+'tid-%d-master.csv'%tid, index=False)
dfout

Unnamed: 0,chembl-id,activity,smiles
0,CHEMBL3639847,10.0,Cc1c2cc(ccc2[nH]c1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)...
1,CHEMBL3690164,10.0,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C...
2,CHEMBL3690165,10.0,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C)...
3,CHEMBL3690177,10.0,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n4...
4,CHEMBL3690178,10.0,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n...
...,...,...,...
18,CHEMBL3099125,0.0,Cc1c(c(nn1CC(=O)O)c2ccccc2)Cc3ccccc3c4cccc5c4c...
20,CHEMBL3099100,0.0,Cc1c(c(nn1CC(=O)O)c2ccc(cc2)C(=O)N(C)C)Cc3cccc...
21,CHEMBL1689116,0.0,c1ccc(cc1)S(=O)(=O)C2CCN(CC2)Cc3cc(ccc3OCC(=O)...
25,CHEMBL3099127,0.0,Cc1c(c(=O)[nH]n1CC(=O)O)Cc2ccccc2S(=O)(=O)c3cc...
