# dataCleansing tid-20174

In [1]:
import pytest
import os,sys
import itertools
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.Scaffolds import MurckoScaffold
#sys.path.append(os.path.join(os.path.dirname(__file__), '../src/'))
sys.path.append('../')
from sphg.infra.xmol import xmol
from sphg.common.graph import Graph
from sphg.common.unique import unique
from sphg.common.scaffold import Scaffold
from sphg.common import pickle0
#from infra.phcg import LPhcg
#from database.ligand import Ligand
import sphg.common.molwt as molwt

| TID    | CHEMBLID   | Name                           | Uniprot |
| -----  | ---------- | ------------------------------ | ------- |
| 8      | CHEMBL1862 | Tyrosine  kinase ABL1          | P00519  |
| 11     | CHEMBL204  | Thrombin                       | P00734  |
| 137    | CHEMBL237  | 𝜅-opioid  receptor             | P41145  |
| 11362  | CHEMBL4005 | PI3-kinase  p110-alpha subunit | N/A     |
| 20174  | N/A  | G protein-coupled receptor 44          | N/A     |
| 104219 | N/A  | Transmembrane protease serine 6      | N/A     |

In [3]:
tid=20174
tsvdir='../../data/sphg/tid-%d/'%tid
files=os.listdir(tsvdir)
files

['tid-20174-actives.txt', 'tid-20174-inactives.txt']

In [4]:
dfactives=pd.read_csv(tsvdir+'tid-%d-actives.txt'%tid,delimiter='\t')
dfinactives=pd.read_csv(tsvdir+'tid-%d-inactives.txt'%tid,delimiter='\t')

In [5]:
dfactives

Unnamed: 0,funatsu_lab_id,chembl-id,"pot.(log,Ki)","pot.(nMol,Ki)",aromatic_smiles,non_stereo_aromatic_smieles,all-chembl-ids,no.-meas.
0,1556435,CHEMBL3639847,10.00,0.1,Cc1c2cc(ccc2[nH]c1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)...,Cc1c2cc(ccc2[nH]c1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)...,CHEMBL3639847,2
1,1560497,CHEMBL3690164,10.00,0.1,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C...,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C...,CHEMBL3690164,2
2,1560498,CHEMBL3690165,10.00,0.1,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C)...,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(cc3)Cn4c(c(c(n4)C)...,CHEMBL3690165,2
3,1560510,CHEMBL3690177,10.00,0.1,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n4...,CCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n4...,CHEMBL3690177,2
4,1560511,CHEMBL3690178,10.00,0.1,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n...,CCCn1c2cc(ccc2cc1C(=O)Nc3ccc(c(c3)F)Cn4c(c(c(n...,CHEMBL3690178,2
...,...,...,...,...,...,...,...,...
855,259538,CHEMBL180732,4.77,17000.0,c1ccc(cc1)CNc2ccc3c(c2)c4c(n3CCC(=O)O)CCCC4,c1ccc(cc1)CNc2ccc3c(c2)c4c(n3CCC(=O)O)CCCC4,CHEMBL180732,1
856,259522,CHEMBL360147,4.74,18000.0,CCC(C(=O)O)n1c2ccc(cc2c3c1CCCC3)NS(=O)(=O)c4cc...,CCC(C(=O)O)n1c2ccc(cc2c3c1CCCC3)NS(=O)(=O)c4cc...,CHEMBL360147,1
857,259602,CHEMBL426240,4.72,19000.0,c1cc(ccc1F)S(=O)(=O)Nc2ccc3c(c2)c4c(n3/C=C\C(=...,c1cc(ccc1F)S(=O)(=O)Nc2ccc3c(c2)c4c(n3C=CC(=O)...,CHEMBL426240,1
858,259450,CHEMBL180098,4.57,27000.0,c1cc(cc(c1)n2c3ccc(cc3c4c2CCCC4)NS(=O)(=O)c5cc...,c1cc(cc(c1)n2c3ccc(cc3c4c2CCCC4)NS(=O)(=O)c5cc...,CHEMBL180098,1


In [6]:
dfinactives

Unnamed: 0,funatsu_lab_id,chembl-id,potencies,units,aromatic_smiles,nonstereo_aromatic_smiles,all-chembl-ids,activity-comment
0,308524,CHEMBL379639,"5200.0,None,None","IC50,Activity,Activity",Cc1c(c2ccccc2n1c3ccncn3)CC(=O)O,Cc1c(c2ccccc2n1c3ccncn3)CC(=O)O,CHEMBL379639,"[None, 'Partial agonist', 'Not Active']"
1,308464,CHEMBL378666,"21.0,None","IC50,Activity",Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4,Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4,CHEMBL378666,"[None, 'Not Active']"
2,308430,CHEMBL378094,"125.0,None","IC50,Activity",Cc1ccc2c(c1)n(c(c2c3ccnc4c3ccc(c4)Cl)C)CC(=O)O,Cc1ccc2c(c1)n(c(c2c3ccnc4c3ccc(c4)Cl)C)CC(=O)O,CHEMBL378094,"[None, 'Not Active']"
3,1246815,CHEMBL2385123,,IC50,Cc1c(c(nn1Cc2ccc(cc2)S(=O)(=O)C)c3ccccc3)CC(=O)O,Cc1c(c(nn1Cc2ccc(cc2)S(=O)(=O)C)c3ccccc3)CC(=O)O,CHEMBL2385123,['Not Active']
4,1246584,CHEMBL2385111,,IC50,Cc1c(c(nn1CC2CCCN2S(=O)(=O)c3ccccc3)c4ccccc4)C...,Cc1c(c(nn1CC2CCCN2S(=O)(=O)c3ccccc3)c4ccccc4)C...,CHEMBL2385111,['Not Active']
5,308280,CHEMBL378730,"2.3,None","IC50,Activity",Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4Cl,Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4Cl,CHEMBL378730,"[None, 'Not Active']"
6,308334,CHEMBL213495,"18.0,None","IC50,Activity",Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4C#N,Cc1ccc2c(c1)c(c(n2CC(=O)O)C)c3ccnc4c3cccc4C#N,CHEMBL213495,"[None, 'Not Active']"
7,1283736,CHEMBL3099128,,IC50,Cc1c(c(nn1CC(=O)O)OC)Cc2ccccc2S(=O)(=O)c3ccccc3,Cc1c(c(nn1CC(=O)O)OC)Cc2ccccc2S(=O)(=O)c3ccccc3,CHEMBL3099128,['Not Active']
8,1381409,CHEMBL3338264,,IC50,Cc1ccc2c(cn(c2n1)c3ccc(cc3Cc4cc(nn4C)C)Cl)CC(=O)O,Cc1ccc2c(cn(c2n1)c3ccc(cc3Cc4cc(nn4C)C)Cl)CC(=O)O,CHEMBL3338264,['Not Active']
9,1381410,CHEMBL3338265,,IC50,Cc1ccc2c(cn(c2n1)c3ccccc3Cn4cc(nc4C)C)CC(=O)O,Cc1ccc2c(cn(c2n1)c3ccccc3Cn4cc(nc4C)C)CC(=O)O,CHEMBL3338265,['Not Active']


In [9]:
smileslist=list(dfactives['non_stereo_aromatic_smiles'])
activitiylist=list(dfactives['pot.(log,Ki)'])

KeyError: 'nonstereo_aromatic_smiles'

In [8]:

nmol=len(smileslist)
    #for imol, (chembl, smiles, activity) in enumerate(zip(chembl_list,smiles_list, activity_list)):
    #    if imol==44:
def showMol(index):
    imol=index
    smiles=smileslist[imol]
    mol=Chem.MolFromSmiles(smiles)
    xmol0 = xmol(mol,molid=0)
    xmol0.activity=activitiylist[imol]
    xmol0.getChemicalFeats()
    xmol0.show(grid=False,num=False)
    print('### calc. %d-th/%d activity=%6.2f'%(imol,nmol,xmol0.activity))
    print('SMILES = %s'%smiles)
    print(xmol0.featsAtomListGraph)
    print(xmol0.featsFamilyList)

NameError: name 'smileslist' is not defined

In [8]:
from ipywidgets import interact
interact(showMol, index=(0, nmol - 1, 1))

interactive(children=(IntSlider(value=458, description='index', max=917), Output()), _dom_classes=('widget-int…

<function __main__.showMol(index)>

In [13]:
#df1=df[['chembl-id','pot.(log,Ki)','non_stereo_aromatic_smiles']]
dfactives1=dfactives[['chembl-id','pot.(log,Ki)','non_stereo_aromatic_smiles']]
dfactives1.columns=['chembl-id','activity','smiles']
dfinactives1=dfinactives[['chembl-id','potencies','non_stereo_aromatic_smiles']]
dfinactives1.columns=['chembl-id','activity','smiles']
dfinactives1=dfinactives1[dfinactives1['activity']=='None']
dfinactives1['activity']=0.0
dfinactives1

Unnamed: 0,chembl-id,activity,smiles
0,CHEMBL404494,0.0,c1coc2c1c(c3cc4c(cc3n2)OCO4)n5ccnc5
1,CHEMBL254801,0.0,COc1ccc2c(c1)nc3c(c2Oc4cccnc4)cco3
2,CHEMBL403440,0.0,c1cc(ccc1[N+](=O)[O-])Sc2c3ccoc3nc4c2cc5c(c4)OCO5
3,CHEMBL255001,0.0,COc1ccc2c(c1)nc3c(c2[S+](c4ccc(cc4)[N+](=O)[O-...
4,CHEMBL2322237,0.0,c1c(=O)c2c(c(cs2)N3CCCCC3)oc1N4CCOCC4
5,CHEMBL258373,0.0,c1cc(ccc1[N+](=O)[O-])Oc2c3ccoc3nc4c2cc5c(c4)OCO5
6,CHEMBL402802,0.0,c1coc2c1c(c3cc4c(cc3n2)OCO4)n5cc(nc5)[N+](=O)[O-]
7,CHEMBL404676,0.0,c1cc(ccc1Oc2c3ccoc3nc4c2cc5c(c4)OCO5)F
8,CHEMBL3780032,0.0,Cc1cnc(nc1OCC(C)(C)CO)N
9,CHEMBL254998,0.0,COc1ccc2c(c1)nc3c(c2Oc4cccc(c4)[N+](=O)[O-])cco3


In [52]:
dfall=pd.concat([dfactives1,dfinactives1])
dfall

Unnamed: 0,chembl-id,activity,smiles
0,CHEMBL3770993,10.52,CC(C)n1c(ncn1)c2cn3c(n2)-c4ccc(cc4OCC3)N5CCCC5...
1,CHEMBL2381382,10.40,CC(C)n1c(nc(n1)N)c2nc-3c(s2)CCOc4c3ccc(c4)c5cn...
2,CHEMBL2381376,10.22,CC(C)n1c(ncn1)c2nc-3c(s2)CCOc4c3ccc(c4)c5cnn(c...
3,CHEMBL2381375,10.22,CC(C)n1c(ncn1)c2nc-3c(s2)CCOc4c3ccc(c4)c5cnn(c...
4,CHEMBL3770306,10.00,CC(C)n1c(ncn1)c2cn3c(n2)-c4ccc(cc4OCC3)N5CCC5C...
...,...,...,...
22,CHEMBL257791,0.00,COc1cc2c(cc1OC)nc3c(c2Sc4ccc(cc4)[N+](=O)[O-])...
23,CHEMBL403855,0.00,COc1cc2c(cc1OC)nc3c(c2n4cc(nc4)[N+](=O)[O-])cco3
24,CHEMBL257372,0.00,COc1cc2c(cc1OC)nc3c(c2Oc4cccnc4)cco3
25,CHEMBL3355474,0.00,CC1COCCN1c2c3c(cc(n2)c4cncc5c4cc[nH]5)n(cn3)C(...


In [58]:
# delete duplicated compounds
dfall=pd.concat([dfactives1,dfinactives1])
print("Before deletion of duplicated compounds: No. of cmpounds is %d"%len(dfall))
tfvec=dfall.duplicated(subset='chembl-id',keep=False)
dfall=dfall[~tfvec]
#print(tfvec)
tfvec=dfall.duplicated(subset='smiles',keep=False)
dfall=dfall[~tfvec]
#print(tfvec)
#print(len(dfall))
print("After deletion of duplicated compounds: No. of cmpounds is %d"%len(dfall))

dfall

Before deletion of duplicated compounds: No. of cmpounds is 944
After deletion of duplicated compounds: No. of cmpounds is 895


Unnamed: 0,chembl-id,activity,smiles
0,CHEMBL3770993,10.52,CC(C)n1c(ncn1)c2cn3c(n2)-c4ccc(cc4OCC3)N5CCCC5...
1,CHEMBL2381382,10.40,CC(C)n1c(nc(n1)N)c2nc-3c(s2)CCOc4c3ccc(c4)c5cn...
2,CHEMBL2381376,10.22,CC(C)n1c(ncn1)c2nc-3c(s2)CCOc4c3ccc(c4)c5cnn(c...
3,CHEMBL2381375,10.22,CC(C)n1c(ncn1)c2nc-3c(s2)CCOc4c3ccc(c4)c5cnn(c...
4,CHEMBL3770306,10.00,CC(C)n1c(ncn1)c2cn3c(n2)-c4ccc(cc4OCC3)N5CCC5C...
...,...,...,...
22,CHEMBL257791,0.00,COc1cc2c(cc1OC)nc3c(c2Sc4ccc(cc4)[N+](=O)[O-])...
23,CHEMBL403855,0.00,COc1cc2c(cc1OC)nc3c(c2n4cc(nc4)[N+](=O)[O-])cco3
24,CHEMBL257372,0.00,COc1cc2c(cc1OC)nc3c(c2Oc4cccnc4)cco3
25,CHEMBL3355474,0.00,CC1COCCN1c2c3c(cc(n2)c4cncc5c4cc[nH]5)n(cn3)C(...


In [59]:
dfout=dfall
#dfout=df1
#dfout=df1.sample(10000,random_state=0)
dfout.to_csv(tsvdir+'tid-%d-master.csv'%tid,index=False)
dfout

Unnamed: 0,chembl-id,activity,smiles
0,CHEMBL3770993,10.52,CC(C)n1c(ncn1)c2cn3c(n2)-c4ccc(cc4OCC3)N5CCCC5...
1,CHEMBL2381382,10.40,CC(C)n1c(nc(n1)N)c2nc-3c(s2)CCOc4c3ccc(c4)c5cn...
2,CHEMBL2381376,10.22,CC(C)n1c(ncn1)c2nc-3c(s2)CCOc4c3ccc(c4)c5cnn(c...
3,CHEMBL2381375,10.22,CC(C)n1c(ncn1)c2nc-3c(s2)CCOc4c3ccc(c4)c5cnn(c...
4,CHEMBL3770306,10.00,CC(C)n1c(ncn1)c2cn3c(n2)-c4ccc(cc4OCC3)N5CCC5C...
...,...,...,...
22,CHEMBL257791,0.00,COc1cc2c(cc1OC)nc3c(c2Sc4ccc(cc4)[N+](=O)[O-])...
23,CHEMBL403855,0.00,COc1cc2c(cc1OC)nc3c(c2n4cc(nc4)[N+](=O)[O-])cco3
24,CHEMBL257372,0.00,COc1cc2c(cc1OC)nc3c(c2Oc4cccnc4)cco3
25,CHEMBL3355474,0.00,CC1COCCN1c2c3c(cc(n2)c4cncc5c4cc[nH]5)n(cn3)C(...
