# Create dataset with pairs of anti-glioblastoma drugs - nanoparticles

In [4]:
import pandas as pd
import numpy as np
import feather

## Modify datasets

Read initial data for drugs:

In [3]:
df_d = pd.read_csv('./datasets/drug(neuro).csv')
df_d.shape

(14061, 39)

In [4]:
# remove duplicates in drugs data
print('Before:', df_d.shape)
df_d.drop_duplicates(keep=False, inplace=True)
print('After :', df_d.shape)

Before: (14061, 39)
After : (14061, 39)


In [6]:
df_d.head()

Unnamed: 0,d_No,CMPD_CHEMBLID,NVR5,PSA,ALOGP,MW,cutoff,vij,c0=Activity,c1=CELL_NAME,...,d_DPSA(c5),d_DALOGP(c5),d_DPSA(c6),d_DALOGP(c6),d_DPSA(c7),d_DALOGP(c7),d_DPSA(c8),d_DALOGP(c8),SMILE,REF
0,1,CHEMBL1358797,0,54.12,3.15,294.35,100,99476.0,EC50 nM,H4,...,5.7133,-0.6609,-17.129,-0.2931,-15.5476,-0.3182,-16.2286,-0.3169,Nc1ccc(Sc2cc(Cl)nc(N)n2)cc1,"15974577, J. Med. Chem. J. Med. Chem. 2005, 48..."
1,2,CHEMBL1170485,0,26.3,2.57,200.24,100,98342.0,EC50 nM,H4,...,-22.1067,-1.2409,-44.949,-0.8731,-43.3676,-0.8982,-44.0486,-0.8969,CCCc1nnc(NC(=O)C2=C(O)c3cccc4CCCN(C2=O)c34)s1,"12904064, J. Med. Chem. J. Med. Chem. 2003, 46..."
2,3,CHEMBL3197411,0,27.63,3.14,326.26,100,96660.0,EC50 nM,H4,...,-20.7767,-0.6709,-43.619,-0.3031,-42.0376,-0.3282,-42.7186,-0.3269,CCN(CC)c1ccc(\C=N\NC(=O)Nc2nonc2N)cc1,"12904064, J. Med. Chem. J. Med. Chem. 2003, 46..."
3,4,CHEMBL1531352,0,58.64,3.09,331.2,100,92479.0,EC50 nM,H4,...,10.2333,-0.7209,-12.609,-0.3531,-11.0276,-0.3782,-11.7086,-0.3769,N1c2ccccc2Sc3cc4ccccc4cc13,"11563926, J. Med. Chem. J. Med. Chem. 2001, 44..."
4,5,CHEMBL1412704,0,54.88,4.66,372.31,100,81785.0,EC50 nM,H4,...,6.4733,0.8491,-16.369,1.2169,-14.7876,1.1918,-15.4686,1.1931,COc1ccc(cc1)C(=O)\C=C\Sc2oc3ccccc3n2,"11563926, J. Med. Chem. J. Med. Chem. 2001, 44..."


Read initial data for nanoparticles:

In [7]:
df_np = pd.read_csv('./datasets/nano(neuro).csv')
df_np.shape

(260, 134)

In [8]:
# remove duplicates in NPs data
print('Before:', df_np.shape)
df_np.drop_duplicates(keep=False, inplace=True)
print('After :', df_np.shape)

Before: (260, 134)
After : (260, 134)


In [9]:
df_np.head()

Unnamed: 0,np_No,Type,SMILES(NP),vij(np),cutoff,c0(np),c1(np),c2(np),c3(np),c4(np),...,np_DTPSA(Tot)coat(c5),np_DALOGPcoat(c5),np_DALOGP2coat(c5),np_DSAtotcoat(c5),np_DSAacccoat(c5),np_DSAdoncoat(c5),np_DVxcoat(c5),np_DVvdwMGcoat(c5),np_DVvdwZAZcoat(c5),np_DPDIcoat(c5)
0,1,SiO2,O=[Si]=O,16644.47403,3099.060596,CC50 (uM),A549 (H),spherical,Dry,UC,...,0,0,0,0,0,0,0,0,0,0
1,2,SiO2,O=[Si]=O,16644.47403,3099.060596,CC50 (uM),A549 (H),spherical,Dry,PEG-Si(OMe)3,...,0,0,0,0,0,0,0,0,0,0
2,3,SiO2,O=[Si]=O,16644.47403,3099.060596,CC50 (uM),BMSC (H),spherical,Dry,UC,...,0,0,0,0,0,0,0,0,0,0
3,4,SiO2,O=[Si]=O,16644.47403,3099.060596,CC50 (uM),BMSC (H),spherical,Dry,PEG-Si(OMe)3,...,0,0,0,0,0,0,0,0,0,0
4,5,SiO2,O=[Si]=O,16644.47403,3099.060596,CC50 (uM),BMSC (M),spherical,Dry,UC,...,0,0,0,0,0,0,0,0,0,0


## Cutoffs

Define the cutoff values for biological activities for drugs and nanoparticles in order to create the output variable as a class (0/1 values).

### Drug cutoff

In [10]:
print(list(df_d.columns))

['d_No', 'CMPD_CHEMBLID', 'NVR5', 'PSA', 'ALOGP', 'MW', 'cutoff', 'vij', 'c0=Activity', 'c1=CELL_NAME', 'c2=ORGANISM', 'c3=TARGET_TYPE', 'c4=ASSAY_ORGANISM', 'c5=TARGETMAPPING', 'c6=CONFIDENCE', 'c7=CURATEDBY', 'c8=ASSAYTYPE', 'f(vij)obs', 'set', 'd_DPSA(c0)', 'd_DALOGP(c0)', 'd_DPSA(c1)', 'd_DALOG(c1)', 'd_DPSA(c2)', 'd_DALOGP(c2)', 'd_DPSA(c3)', 'd_DALOGP(c3)', 'd_DPSA(c4)', 'd_DALOGP(c4)', 'd_DPSA(c5)', 'd_DALOGP(c5)', 'd_DPSA(c6)', 'd_DALOGP(c6)', 'd_DPSA(c7)', 'd_DALOGP(c7)', 'd_DPSA(c8)', 'd_DALOGP(c8)', 'SMILE', 'REF']


In [11]:
# verify cutoff values
set(df_d['cutoff'])

{100}

In [12]:
# verify c0
grouped = df_d[['c0=Activity','vij']].groupby('c0=Activity')
# how many examples by each c0
grouped.count()

Unnamed: 0_level_0,vij
c0=Activity,Unnamed: 1_level_1
EC50 nM,13653
EC50 ug.mL-1,4
IC50 nM,366
IC50 ug.mL-1,13
LC50 nM,25


In [13]:
#remove some drug Activities
df_d = df_d[df_d['c0=Activity']!='EC50 ug.mL-1']
df_d = df_d[df_d['c0=Activity']!='IC50 ug.mL-1']
df_d.shape

(14044, 39)

In [14]:
# verify c0
grouped = df_d[['c0=Activity','vij']].groupby('c0=Activity')
# how many examples by each c0
grouped.count()

Unnamed: 0_level_0,vij
c0=Activity,Unnamed: 1_level_1
EC50 nM,13653
IC50 nM,366
LC50 nM,25


In [15]:
# verify c0 for drugs - median values
grouped = df_d[['c0=Activity','vij']].groupby('c0=Activity')
grouped.median()

Unnamed: 0_level_0,vij
c0=Activity,Unnamed: 1_level_1
EC50 nM,15023.0
IC50 nM,10000.0
LC50 nM,42000.0


In [16]:
# create logaritm of activity
df_d['log_vij']=np.log(df_d['vij']+1E-15)

In [17]:
# verify c0 for drugs - median values
grouped = df_d[['c0=Activity','log_vij']].groupby('c0=Activity')
grouped.median()

Unnamed: 0_level_0,log_vij
c0=Activity,Unnamed: 1_level_1
EC50 nM,9.617338
IC50 nM,9.21034
LC50 nM,10.645425


In [18]:
grouped.describe()

Unnamed: 0_level_0,log_vij,log_vij,log_vij,log_vij,log_vij,log_vij,log_vij,log_vij
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
c0=Activity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
EC50 nM,13653.0,9.357031,2.195091,1.110223e-15,8.580919,9.617338,11.695247,12.180755
IC50 nM,366.0,6.901259,5.449552,-34.53878,4.60517,9.21034,9.21034,12.429216
LC50 nM,25.0,10.721798,0.876695,9.392662,9.998798,10.645425,10.968198,12.32829


In [19]:
# set drug cutoffs
df_d.loc[df_d['c0=Activity'] == 'EC50 nM', 'cutoff'] = 10
df_d.loc[df_d['c0=Activity'] == 'IC50 nM', 'cutoff'] = 10
df_d.loc[df_d['c0=Activity'] == 'LC50 nM', 'cutoff'] = 10

In [20]:
# make other value before modification
df_d['f(vij)obs'] = -1
df_d['f(vij)obs'] = np.where(df_d['log_vij'] < df_d['cutoff'], 1, 0)
print('Values=',list(set(df_d['f(vij)obs'])))
print('Count=',df_d['f(vij)obs'].count())
print('Sum=',df_d['f(vij)obs'].sum())

Values= [0, 1]
Count= 14044
Sum= 8393


In [21]:
# raw dataset ONLY DRUGS (with extra columns!)
df_d.to_csv('./drug_1_cutoff.csv', index=False)

### NP cutoff

In [22]:
print(list(df_np.columns))
# verify cutoff values
set(df_np['cutoff'])

['np_No', 'Type', 'SMILES(NP)', 'vij(np)', 'cutoff', 'c0(np)', 'c1(np)', 'c2(np)', 'c3(np)', 'c4(np)', 'SMILES (sc)', 'Observations', 'f(vijnp)', 'time', 'NMUnp', 'Lnp', 'Vnpu', 'Enpu', 'Pnpu', 'Uccoat', 'Uicoat', 'Hycoat', 'AMRcoat', 'TPSA(NO)coat', 'TPSA(Tot)coat', 'ALOGPcoat', 'ALOGP2coat', 'SAtotcoat', 'SAacccoat', 'SAdoncoat', 'Vxcoat', 'VvdwMGcoat', 'VvdwZAZcoat', 'PDIcoat', 'np_DNMUnp(c0)', 'np_DLnp(c0)', 'np_DVnpu(c0)', 'np_DEnpu(c0)', 'np_DPnpu(c0)', 'np_DUccoat(c0)', 'np_DUicoat(c0)', 'np_DHycoat(c0)', 'np_DAMRcoat(c0)', 'np_DTPSA(NO)coat(c0)', 'np_DTPSA(Tot)coat(c0)', 'np_DALOGPcoat(c0)', 'np_DALOGP2coat(c0)', 'np_DSAtotcoat(c0)', 'np_DSAacccoat(c0)', 'np_DSAdoncoat(c0)', 'np_DVxcoat(c0)', 'np_DVvdwMGcoat(c0)', 'np_DVvdwZAZcoat(c0)', 'np_DPDIcoat(c0)', 'np_DNMUnp(c1)', 'np_DLnp(c1)', 'np_DVnpu(c1)', 'np_DEnpu(c1)', 'np_DPnpu(c1)', 'np_DUccoat(c1)', 'np_DUicoat(c1)', 'np_DHycoat(c1)', 'np_DAMRcoat(c1)', 'np_DTPSA(NO)coat(c1)', 'np_DTPSA(Tot)coat(c1)', 'np_DALOGPcoat(c1)', 'np

{3099.0605960000003, 5224.5211899999995, 7610.649702, 18713.85962, 25421.6579}

In [23]:
# verify c0
grouped_np = df_np[['c0(np)','vij(np)']].groupby('c0(np)')
# how many examples by each c0
grouped_np.count()

Unnamed: 0_level_0,vij(np)
c0(np),Unnamed: 1_level_1
CC50 (uM),113
EC50 (uM),30
IC50 (uM)p,29
LC50 (uM),69
TC50 (uM),19


In [24]:
grouped_np = df_np[['c0(np)','vij(np)']].groupby('c0(np)')
# how many examples by each c0
grouped_np.median()

Unnamed: 0_level_0,vij(np)
c0(np),Unnamed: 1_level_1
CC50 (uM),503.572215
EC50 (uM),262.837339
IC50 (uM)p,463.529499
LC50 (uM),125.209726
TC50 (uM),507.699259


In [25]:
# create logaritm of activity
df_np['log_vij(np)']=np.log(df_np['vij(np)']+1E-15)

In [26]:
grouped_np = df_np[['c0(np)','log_vij(np)']].groupby('c0(np)')
# how many examples by each c0
grouped_np.median()

Unnamed: 0_level_0,log_vij(np)
c0(np),Unnamed: 1_level_1
CC50 (uM),6.221727
EC50 (uM),5.571486
IC50 (uM)p,6.13887
LC50 (uM),4.82999
TC50 (uM),6.229889


In [27]:
grouped_np.describe()

Unnamed: 0_level_0,log_vij(np),log_vij(np),log_vij(np),log_vij(np),log_vij(np),log_vij(np),log_vij(np),log_vij(np)
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
c0(np),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
CC50 (uM),113.0,6.00018,2.579729,-0.850365,5.247155,6.221727,7.417248,9.719834
EC50 (uM),30.0,4.701392,4.095544,-4.074542,1.874161,5.571486,6.785517,12.430893
IC50 (uM)p,29.0,7.627178,2.45041,4.010463,6.12539,6.13887,11.044598,11.044598
LC50 (uM),69.0,4.33592,2.898911,-3.623118,2.694116,4.82999,5.91517,12.430893
TC50 (uM),19.0,5.903886,2.952328,1.511131,3.121622,6.229889,7.937426,10.709185


In [28]:
# set NP cutoffs
df_np['cutoff'] = 6

In [29]:
# make other value before modification
df_np['f(vijnp)'] = -1
df_np['f(vijnp)'] = np.where(df_np['log_vij(np)'] < df_np['cutoff'], 1, 0)
print('Values=',list(set(df_np['f(vijnp)'])))
print('Count=',df_np['f(vijnp)'].count())
print('Sum=',df_np['f(vijnp)'].sum())

Values= [0, 1]
Count= 260
Sum= 137


In [30]:
# raw dataset ONLY NPs (with extra columns!)
df_np.to_csv('./NP_1_cutoff.csv', index=False)

## Desiderability

### Desiderability for drugs

In [31]:
# verify c0
grouped = df_d[['c0=Activity','log_vij']].groupby('c0=Activity')
# how many examples by each c0
grouped.count()

Unnamed: 0_level_0,log_vij
c0=Activity,Unnamed: 1_level_1
EC50 nM,13653
IC50 nM,366
LC50 nM,25


In [32]:
# set drug desirabilities
df_d.loc[df_d['c0=Activity'] == 'EC50 nM', 'Desirability'] = -1
df_d.loc[df_d['c0=Activity'] == 'IC50 nM', 'Desirability'] = -1
df_d.loc[df_d['c0=Activity'] == 'LC50 nM', 'Desirability'] = 1

In [33]:
set(df_d['Desirability'])

{-1.0, 1.0}

### Desiderability for NPs

In [34]:
grouped_np = df_np[['c0(np)','log_vij(np)']].groupby('c0(np)')
# how many examples by each c0
grouped_np.count()

Unnamed: 0_level_0,log_vij(np)
c0(np),Unnamed: 1_level_1
CC50 (uM),113
EC50 (uM),30
IC50 (uM)p,29
LC50 (uM),69
TC50 (uM),19


In [35]:
# set NP desirabilities
df_np.loc[df_np['c0(np)'] == 'CC50 (uM)', 'Desirability']   = 1
df_np.loc[df_np['c0(np)'] == 'EC50 (uM)', 'Desirability']   = -1
df_np.loc[df_np['c0(np)'] == 'IC50 (uM)p', 'Desirability']  = -1
df_np.loc[df_np['c0(np)'] == 'LC50 (uM)', 'Desirability']   = 1
df_np.loc[df_np['c0(np)'] == 'TC50 (uM)', 'Desirability']   = 1

In [36]:
set(df_np['Desirability'])

{-1.0, 1.0}

## Good - Bad

Two new columns (one for drugs, other for NPs) for future calculation of the final output variable:

In [37]:
df_d.loc[(df_d['Desirability'] == 1) & (df_d['log_vij'] > df_d['cutoff']), 'd_Good_Bad'] = 'Good'
df_d.loc[(df_d['Desirability'] == 1) & (df_d['log_vij'] < df_d['cutoff']), 'd_Good_Bad'] = 'Bad'
df_d.loc[(df_d['Desirability'] == -1) & (df_d['log_vij'] < df_d['cutoff']), 'd_Good_Bad'] = 'Good'
df_d.loc[(df_d['Desirability'] == -1) & (df_d['log_vij'] > df_d['cutoff']), 'd_Good_Bad'] = 'Bad'

In [38]:
set(df_d['d_Good_Bad'])

{'Bad', 'Good'}

In [39]:
df_d['d_Good_Bad'].value_counts()

Good    8404
Bad     5640
Name: d_Good_Bad, dtype: int64

In [40]:
df_np.loc[(df_np['Desirability'] ==  1) & (df_np['log_vij(np)'] > df_np['cutoff']), 'np_Good_Bad'] = 'Good'
df_np.loc[(df_np['Desirability'] ==  1) & (df_np['log_vij(np)'] < df_np['cutoff']), 'np_Good_Bad'] = 'Bad'
df_np.loc[(df_np['Desirability'] == -1) & (df_np['log_vij(np)'] < df_np['cutoff']), 'np_Good_Bad'] = 'Good'
df_np.loc[(df_np['Desirability'] == -1) & (df_np['log_vij(np)'] > df_np['cutoff']), 'np_Good_Bad'] = 'Bad'

In [41]:
set(df_np['np_Good_Bad'])

{'Bad', 'Good'}

In [42]:
df_np

Unnamed: 0,np_No,Type,SMILES(NP),vij(np),cutoff,c0(np),c1(np),c2(np),c3(np),c4(np),...,np_DSAtotcoat(c5),np_DSAacccoat(c5),np_DSAdoncoat(c5),np_DVxcoat(c5),np_DVvdwMGcoat(c5),np_DVvdwZAZcoat(c5),np_DPDIcoat(c5),log_vij(np),Desirability,np_Good_Bad
0,1,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),A549 (H),spherical,Dry,UC,...,0,0,0,0,0,0,0,9.719834,1.0,Good
1,2,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),A549 (H),spherical,Dry,PEG-Si(OMe)3,...,0,0,0,0,0,0,0,9.719834,1.0,Good
2,3,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),BMSC (H),spherical,Dry,UC,...,0,0,0,0,0,0,0,9.719834,1.0,Good
3,4,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),BMSC (H),spherical,Dry,PEG-Si(OMe)3,...,0,0,0,0,0,0,0,9.719834,1.0,Good
4,5,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),BMSC (M),spherical,Dry,UC,...,0,0,0,0,0,0,0,9.719834,1.0,Good
5,6,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),BMSC (M),spherical,Dry,PEG-Si(OMe)3,...,0,0,0,0,0,0,0,9.719834,1.0,Good
6,7,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),HEK293 (H),spherical,Dry,UC,...,0,0,0,0,0,0,0,9.719834,1.0,Good
7,8,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),HEK293 (H),spherical,Dry,PEG-Si(OMe)3,...,0,0,0,0,0,0,0,9.719834,1.0,Good
8,9,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),HepG2 (H),spherical,Dry,UC,...,0,0,0,0,0,0,0,9.719834,1.0,Good
9,10,SiO2,O=[Si]=O,16644.474030,6,CC50 (uM),HepG2 (H),spherical,Dry,PEG-Si(OMe)3,...,0,0,0,0,0,0,0,9.719834,1.0,Good


In [43]:
df_np['np_Good_Bad'].value_counts()

Bad     144
Good    116
Name: np_Good_Bad, dtype: int64

In [44]:
df_d.to_csv('./drug_2_DesiderabilityGoodBad.csv', index=False)

In [45]:
df_np.to_csv('./NP_2_DesiderabilityGoodBad.csv', index=False)

## Select descriptors for drugs and NPs

These data will be merged to create pairs.

In [46]:
print(list(df_d.columns))

['d_No', 'CMPD_CHEMBLID', 'NVR5', 'PSA', 'ALOGP', 'MW', 'cutoff', 'vij', 'c0=Activity', 'c1=CELL_NAME', 'c2=ORGANISM', 'c3=TARGET_TYPE', 'c4=ASSAY_ORGANISM', 'c5=TARGETMAPPING', 'c6=CONFIDENCE', 'c7=CURATEDBY', 'c8=ASSAYTYPE', 'f(vij)obs', 'set', 'd_DPSA(c0)', 'd_DALOGP(c0)', 'd_DPSA(c1)', 'd_DALOG(c1)', 'd_DPSA(c2)', 'd_DALOGP(c2)', 'd_DPSA(c3)', 'd_DALOGP(c3)', 'd_DPSA(c4)', 'd_DALOGP(c4)', 'd_DPSA(c5)', 'd_DALOGP(c5)', 'd_DPSA(c6)', 'd_DALOGP(c6)', 'd_DPSA(c7)', 'd_DALOGP(c7)', 'd_DPSA(c8)', 'd_DALOGP(c8)', 'SMILE', 'REF', 'log_vij', 'Desirability', 'd_Good_Bad']


In [47]:
drug_cols = ['c0=Activity','d_Good_Bad','d_DPSA(c0)', 'd_DALOGP(c0)', 'd_DPSA(c1)', 'd_DALOG(c1)', 
             'd_DPSA(c2)', 'd_DALOGP(c2)', 'd_DPSA(c3)', 'd_DALOGP(c3)', 'd_DPSA(c4)', 
             'd_DALOGP(c4)', 'd_DPSA(c5)', 'd_DALOGP(c5)', 'd_DPSA(c6)', 'd_DALOGP(c6)', 
             'd_DPSA(c7)', 'd_DALOGP(c7)', 'd_DPSA(c8)', 'd_DALOGP(c8)']

In [48]:
print(list(df_np.columns))

['np_No', 'Type', 'SMILES(NP)', 'vij(np)', 'cutoff', 'c0(np)', 'c1(np)', 'c2(np)', 'c3(np)', 'c4(np)', 'SMILES (sc)', 'Observations', 'f(vijnp)', 'time', 'NMUnp', 'Lnp', 'Vnpu', 'Enpu', 'Pnpu', 'Uccoat', 'Uicoat', 'Hycoat', 'AMRcoat', 'TPSA(NO)coat', 'TPSA(Tot)coat', 'ALOGPcoat', 'ALOGP2coat', 'SAtotcoat', 'SAacccoat', 'SAdoncoat', 'Vxcoat', 'VvdwMGcoat', 'VvdwZAZcoat', 'PDIcoat', 'np_DNMUnp(c0)', 'np_DLnp(c0)', 'np_DVnpu(c0)', 'np_DEnpu(c0)', 'np_DPnpu(c0)', 'np_DUccoat(c0)', 'np_DUicoat(c0)', 'np_DHycoat(c0)', 'np_DAMRcoat(c0)', 'np_DTPSA(NO)coat(c0)', 'np_DTPSA(Tot)coat(c0)', 'np_DALOGPcoat(c0)', 'np_DALOGP2coat(c0)', 'np_DSAtotcoat(c0)', 'np_DSAacccoat(c0)', 'np_DSAdoncoat(c0)', 'np_DVxcoat(c0)', 'np_DVvdwMGcoat(c0)', 'np_DVvdwZAZcoat(c0)', 'np_DPDIcoat(c0)', 'np_DNMUnp(c1)', 'np_DLnp(c1)', 'np_DVnpu(c1)', 'np_DEnpu(c1)', 'np_DPnpu(c1)', 'np_DUccoat(c1)', 'np_DUicoat(c1)', 'np_DHycoat(c1)', 'np_DAMRcoat(c1)', 'np_DTPSA(NO)coat(c1)', 'np_DTPSA(Tot)coat(c1)', 'np_DALOGPcoat(c1)', 'np

In [49]:
np_cols = ['c0(np)','np_Good_Bad','np_DNMUnp(c0)', 'np_DLnp(c0)', 'np_DVnpu(c0)', 'np_DEnpu(c0)', 
           'np_DPnpu(c0)', 'np_DUccoat(c0)',
           'np_DUicoat(c0)', 'np_DHycoat(c0)', 'np_DAMRcoat(c0)', 'np_DTPSA(NO)coat(c0)', 'np_DTPSA(Tot)coat(c0)', 'np_DALOGPcoat(c0)',
           'np_DALOGP2coat(c0)', 'np_DSAtotcoat(c0)', 'np_DSAacccoat(c0)', 'np_DSAdoncoat(c0)', 'np_DVxcoat(c0)', 'np_DVvdwMGcoat(c0)',
           'np_DVvdwZAZcoat(c0)', 'np_DPDIcoat(c0)', 'np_DNMUnp(c1)', 'np_DLnp(c1)', 'np_DVnpu(c1)', 'np_DEnpu(c1)', 'np_DPnpu(c1)',
           'np_DUccoat(c1)', 'np_DUicoat(c1)', 'np_DHycoat(c1)', 'np_DAMRcoat(c1)', 'np_DTPSA(NO)coat(c1)', 'np_DTPSA(Tot)coat(c1)',
           'np_DALOGPcoat(c1)', 'np_DALOGP2coat(c1)', 'np_DSAtotcoat(c1)', 'np_DSAacccoat(c1)', 'np_DSAdoncoat(c1)', 'np_DVxcoat(c1)',
           'np_DVvdwMGcoat(c1)', 'np_DVvdwZAZcoat(c1)', 'np_DPDIcoat(c1)', 'np_DNMUnp(c2)', 'np_DLnp(c2)', 'np_DVnpu(c2)', 'np_DEnpu(c2)',
           'np_DPnpu(c2)', 'np_DUccoat(c2)', 'np_DUicoat(c2)', 'np_DHycoat(c2)', 'np_DAMRcoat(c2)', 'np_DTPSA(NO)coat(c2)', 
           'np_DTPSA(Tot)coat(c2)', 'np_DALOGPcoat(c2)', 'np_DALOGP2coat(c2)', 'np_DSAtotcoat(c2)', 'np_DSAacccoat(c2)', 
           'np_DSAdoncoat(c2)', 'np_DVxcoat(c2)', 'np_DVvdwMGcoat(c2)', 'np_DVvdwZAZcoat(c2)', 'np_DPDIcoat(c2)', 'np_DNMUnp(c3)',
           'np_DLnp(c3)', 'np_DVnpu(c3)', 'np_DEnpu(c3)', 'np_DPnpu(c3)', 'np_DUccoat(c3)', 'np_DUicoat(c3)', 'np_DHycoat(c3)',
           'np_DAMRcoat(c3)', 'np_DTPSA(NO)coat(c3)', 'np_DTPSA(Tot)coat(c3)', 'np_DALOGPcoat(c3)', 'np_DALOGP2coat(c3)',
           'np_DSAtotcoat(c3)', 'np_DSAacccoat(c3)', 'np_DSAdoncoat(c3)', 'np_DVxcoat(c3)', 'np_DVvdwMGcoat(c3)', 'np_DVvdwZAZcoat(c3)',
           'np_DPDIcoat(c3)', 'np_DNMUnp(c5)', 'np_DLnp(c5)', 'np_DVnpu(c5)', 'np_DEnpu(c5)', 'np_DPnpu(c5)', 'np_DUccoat(c5)',
           'np_DUicoat(c5)', 'np_DHycoat(c5)', 'np_DAMRcoat(c5)', 'np_DTPSA(NO)coat(c5)', 'np_DTPSA(Tot)coat(c5)', 'np_DALOGPcoat(c5)',
           'np_DALOGP2coat(c5)', 'np_DSAtotcoat(c5)', 'np_DSAacccoat(c5)', 'np_DSAdoncoat(c5)', 'np_DVxcoat(c5)', 'np_DVvdwMGcoat(c5)',
           'np_DVvdwZAZcoat(c5)', 'np_DPDIcoat(c5)']

In [50]:
# get only some columns (descriptors + Good_Bad)
df_d2 = df_d[drug_cols].copy()
df_np2= df_np[np_cols].copy()

In [51]:
df_d2.shape

(14044, 20)

In [52]:
df_np2.shape

(260, 102)

We are combining 14044 drugs data with 260 NP data using 18 drug descriptors and 100 NP descriptors.

In [53]:
df_d.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            14051, 14052, 14053, 14054, 14055, 14056, 14057, 14058, 14059,
            14060],
           dtype='int64', length=14044)

In [54]:
df_np.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            250, 251, 252, 253, 254, 255, 256, 257, 258, 259],
           dtype='int64', length=260)

In [55]:
# simulate pairs 
pairs = 0
for d_index in df_d2.index:
    for np_index in df_np2.index:
        pairs +=1
print('Total pairs drug - np = ', pairs)

Total pairs drug - np =  3651440


In [56]:
# create temporal columns to combine both dataframes
df_d2['tmp']  = 1
df_np2['tmp'] = 1

In [57]:
# merge dataframes
df_pairs = pd.merge(df_d2, df_np2, on=['tmp'])
# remove temporal column
df_pairs = df_pairs.drop('tmp', axis=1)

In [58]:
df_pairs.shape

(3651440, 122)

In [59]:
df_pairs.head(2)

Unnamed: 0,c0=Activity,d_Good_Bad,d_DPSA(c0),d_DALOGP(c0),d_DPSA(c1),d_DALOG(c1),d_DPSA(c2),d_DALOGP(c2),d_DPSA(c3),d_DALOGP(c3),...,np_DTPSA(Tot)coat(c5),np_DALOGPcoat(c5),np_DALOGP2coat(c5),np_DSAtotcoat(c5),np_DSAacccoat(c5),np_DSAdoncoat(c5),np_DVxcoat(c5),np_DVvdwMGcoat(c5),np_DVvdwZAZcoat(c5),np_DPDIcoat(c5)
0,EC50 nM,Bad,-15.649825,-0.313603,-18.277893,-0.305849,-12.0831,-0.2136,-12.1839,-0.2131,...,0,0,0,0,0,0,0,0,0,0
1,EC50 nM,Bad,-15.649825,-0.313603,-18.277893,-0.305849,-12.0831,-0.2136,-12.1839,-0.2131,...,0,0,0,0,0,0,0,0,0,0


## Final class using Good/Bad of drugs and NPs

In [60]:
df_pairs['Class'] = 0 # default is bad
df_pairs.loc[(df_pairs['d_Good_Bad'] ==  'Good') & (df_pairs['np_Good_Bad'] == 'Good'), 'Class'] = 1

## Add c0 pairs Probability

In [82]:
groupedp = df_pairs[['Class','c0=Activity','c0(np)']].groupby(['c0=Activity','c0(np)'])
df_prob = groupedp.count().reset_index()
df_prob

Unnamed: 0,c0=Activity,c0(np),Class
0,EC50 nM,CC50 (uM),1542789
1,EC50 nM,EC50 (uM),409590
2,EC50 nM,IC50 (uM)p,395937
3,EC50 nM,LC50 (uM),942057
4,EC50 nM,TC50 (uM),259407
5,IC50 nM,CC50 (uM),41358
6,IC50 nM,EC50 (uM),10980
7,IC50 nM,IC50 (uM)p,10614
8,IC50 nM,LC50 (uM),25254
9,IC50 nM,TC50 (uM),6954


In [83]:
df_prob.rename(columns={'Class': 'Counts'}, inplace=True)
df_prob

Unnamed: 0,c0=Activity,c0(np),Counts
0,EC50 nM,CC50 (uM),1542789
1,EC50 nM,EC50 (uM),409590
2,EC50 nM,IC50 (uM)p,395937
3,EC50 nM,LC50 (uM),942057
4,EC50 nM,TC50 (uM),259407
5,IC50 nM,CC50 (uM),41358
6,IC50 nM,EC50 (uM),10980
7,IC50 nM,IC50 (uM)p,10614
8,IC50 nM,LC50 (uM),25254
9,IC50 nM,TC50 (uM),6954


In [84]:
Total_c0s = df_prob['Counts'].sum()
Total_c0s

3651440

In [85]:
#calculate probability of pairs of c0 drug - nano
df_prob['probability'] = df_prob['Counts']/Total_c0s
df_prob

Unnamed: 0,c0=Activity,c0(np),Counts,probability
0,EC50 nM,CC50 (uM),1542789,0.422515
1,EC50 nM,EC50 (uM),409590,0.112172
2,EC50 nM,IC50 (uM)p,395937,0.108433
3,EC50 nM,LC50 (uM),942057,0.257996
4,EC50 nM,TC50 (uM),259407,0.071042
5,IC50 nM,CC50 (uM),41358,0.011326
6,IC50 nM,EC50 (uM),10980,0.003007
7,IC50 nM,IC50 (uM)p,10614,0.002907
8,IC50 nM,LC50 (uM),25254,0.006916
9,IC50 nM,TC50 (uM),6954,0.001904


In [86]:
df_pairs2 = pd.merge(df_prob, df_pairs, on=['c0=Activity', 'c0(np)'])

In [88]:
df_pairs2.head(2)

Unnamed: 0,c0=Activity,c0(np),Counts,probability,d_Good_Bad,d_DPSA(c0),d_DALOGP(c0),d_DPSA(c1),d_DALOG(c1),d_DPSA(c2),...,np_DALOGPcoat(c5),np_DALOGP2coat(c5),np_DSAtotcoat(c5),np_DSAacccoat(c5),np_DSAdoncoat(c5),np_DVxcoat(c5),np_DVvdwMGcoat(c5),np_DVvdwZAZcoat(c5),np_DPDIcoat(c5),Class
0,EC50 nM,CC50 (uM),1542789,0.422515,Bad,-15.649825,-0.313603,-18.277893,-0.305849,-12.0831,...,0,0,0,0,0,0,0,0,0,0
1,EC50 nM,CC50 (uM),1542789,0.422515,Bad,-15.649825,-0.313603,-18.277893,-0.305849,-12.0831,...,0,0,0,0,0,0,0,0,0,0


In [89]:
print(list(df_pairs2.columns))

['c0=Activity', 'c0(np)', 'Counts', 'probability', 'd_Good_Bad', 'd_DPSA(c0)', 'd_DALOGP(c0)', 'd_DPSA(c1)', 'd_DALOG(c1)', 'd_DPSA(c2)', 'd_DALOGP(c2)', 'd_DPSA(c3)', 'd_DALOGP(c3)', 'd_DPSA(c4)', 'd_DALOGP(c4)', 'd_DPSA(c5)', 'd_DALOGP(c5)', 'd_DPSA(c6)', 'd_DALOGP(c6)', 'd_DPSA(c7)', 'd_DALOGP(c7)', 'd_DPSA(c8)', 'd_DALOGP(c8)', 'np_Good_Bad', 'np_DNMUnp(c0)', 'np_DLnp(c0)', 'np_DVnpu(c0)', 'np_DEnpu(c0)', 'np_DPnpu(c0)', 'np_DUccoat(c0)', 'np_DUicoat(c0)', 'np_DHycoat(c0)', 'np_DAMRcoat(c0)', 'np_DTPSA(NO)coat(c0)', 'np_DTPSA(Tot)coat(c0)', 'np_DALOGPcoat(c0)', 'np_DALOGP2coat(c0)', 'np_DSAtotcoat(c0)', 'np_DSAacccoat(c0)', 'np_DSAdoncoat(c0)', 'np_DVxcoat(c0)', 'np_DVvdwMGcoat(c0)', 'np_DVvdwZAZcoat(c0)', 'np_DPDIcoat(c0)', 'np_DNMUnp(c1)', 'np_DLnp(c1)', 'np_DVnpu(c1)', 'np_DEnpu(c1)', 'np_DPnpu(c1)', 'np_DUccoat(c1)', 'np_DUicoat(c1)', 'np_DHycoat(c1)', 'np_DAMRcoat(c1)', 'np_DTPSA(NO)coat(c1)', 'np_DTPSA(Tot)coat(c1)', 'np_DALOGPcoat(c1)', 'np_DALOGP2coat(c1)', 'np_DSAtotcoat(c

In [90]:
# drop extra columns
df_pairs2 = df_pairs2.drop(['d_Good_Bad','np_Good_Bad','c0=Activity','c0(np)','Counts'], axis=1)

In [91]:
print(list(df_pairs2.columns))

['probability', 'd_DPSA(c0)', 'd_DALOGP(c0)', 'd_DPSA(c1)', 'd_DALOG(c1)', 'd_DPSA(c2)', 'd_DALOGP(c2)', 'd_DPSA(c3)', 'd_DALOGP(c3)', 'd_DPSA(c4)', 'd_DALOGP(c4)', 'd_DPSA(c5)', 'd_DALOGP(c5)', 'd_DPSA(c6)', 'd_DALOGP(c6)', 'd_DPSA(c7)', 'd_DALOGP(c7)', 'd_DPSA(c8)', 'd_DALOGP(c8)', 'np_DNMUnp(c0)', 'np_DLnp(c0)', 'np_DVnpu(c0)', 'np_DEnpu(c0)', 'np_DPnpu(c0)', 'np_DUccoat(c0)', 'np_DUicoat(c0)', 'np_DHycoat(c0)', 'np_DAMRcoat(c0)', 'np_DTPSA(NO)coat(c0)', 'np_DTPSA(Tot)coat(c0)', 'np_DALOGPcoat(c0)', 'np_DALOGP2coat(c0)', 'np_DSAtotcoat(c0)', 'np_DSAacccoat(c0)', 'np_DSAdoncoat(c0)', 'np_DVxcoat(c0)', 'np_DVvdwMGcoat(c0)', 'np_DVvdwZAZcoat(c0)', 'np_DPDIcoat(c0)', 'np_DNMUnp(c1)', 'np_DLnp(c1)', 'np_DVnpu(c1)', 'np_DEnpu(c1)', 'np_DPnpu(c1)', 'np_DUccoat(c1)', 'np_DUicoat(c1)', 'np_DHycoat(c1)', 'np_DAMRcoat(c1)', 'np_DTPSA(NO)coat(c1)', 'np_DTPSA(Tot)coat(c1)', 'np_DALOGPcoat(c1)', 'np_DALOGP2coat(c1)', 'np_DSAtotcoat(c1)', 'np_DSAacccoat(c1)', 'np_DSAdoncoat(c1)', 'np_DVxcoat(c1)',

In [92]:
#remove duplicates
print('Before:', df_pairs2.shape)
df_pairs2.drop_duplicates(keep=False, inplace=True)
print('After:', df_pairs2.shape)

Before: (3651440, 120)
After: (855129, 120)


In [93]:
df_pairs2.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 855129 entries, 0 to 3651361
Columns: 120 entries, probability to Class
dtypes: float64(19), int64(101)
memory usage: 789.4 MB


In [94]:
# df_last.to_feather('./datasets/ds.Class.feather')
feather.write_dataframe(df_pairs2, './datasets/ds.Class.feather')

This dataset will be used with Machine Learning methods to find the best classifiers.

Hf with ML| @muntisa