#### Generate predictions for a new list of chemicals for Fraction Unbound


- Step 1: Identify substances of interest and their SMILES codes - Use KNIME to convert SMILES into a V2000 sdf file
- See KNIME workflow presented in models directory (httk/models) for example knwf file generated in KNIME 3.7.2
- Step 2: Use sdf file to generate Pubchem and ToxPrint Fingerprints using KNIME and the Chemotyper
- Step 3: Use sdf file to generate OPERA descriptors (v2.6)


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import glob

In [2]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import r2_score
import pickle

In [3]:
def normalizeDescriptors(X):
    scaler = preprocessing.StandardScaler().fit(X)
    transformed = scaler.transform(X)
    x_norm = pd.DataFrame(transformed, index = X.index) 
    x_norm.columns = X.columns
    return(x_norm)

In [4]:
raw_dir = '/home/grace/Documents/python/httk/data/raw/'
processed_dir = '/home/grace/Documents/python/httk/data/processed/'
interim_dir = '/home/grace/Documents/python/httk/data/interim/'
figures_dir = '/home/grace/Documents/python/httk/reports/figures/'
external_dir = '/home/grace/Documents/python/httk/data/external/'
models_dir = '/home/grace/Documents/python/httk/models/'

Importing descriptor files

In [5]:
pubchem = pd.read_csv(processed_dir+'Fub_Pubchem.csv')

In [6]:
pubchem.head()

Unnamed: 0,CASRN,bitvector0,bitvector1,bitvector2,bitvector3,bitvector4,bitvector5,bitvector6,bitvector7,bitvector8,...,bitvector871,bitvector872,bitvector873,bitvector874,bitvector875,bitvector876,bitvector877,bitvector878,bitvector879,bitvector880
0,94-74-6,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,148477-71-8,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,56-29-1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,153233-91-1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,96182-53-5,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
cdk = pd.read_csv(processed_dir+'Fub_CDK.csv')

In [8]:
cdk.head()

Unnamed: 0,Molecule,INPUT,FOUND_BY,DTXSID,PREFERRED_NAME,CASRN,Molecular Weight,SMILES,QSAR_READY_SMILES,Mannhold LogP,...,XLogP,Zagreb Index,Molecular Formula,Formal Charge,Formal Charge (pos),Formal Charge (neg),Heavy Atoms Count,Molar Mass,SP3 Character,Rotatable Bonds Count (non terminal)
0,Cc1c(OCC(O)=O)ccc(Cl)c1,94-74-6,CAS-RN,DTXSID4024195,MCPA,94-74-6,200.024022,CC1=C(OCC(O)=O)C=CC(Cl)=C1,CC1=C(OCC(O)=O)C=CC(Cl)=C1,2.01,...,2.167,60,C9H9ClO3,0,0,0,13,200.619242,0.090909,3
1,CCC(C)(C)C(=O)OC1=C(C(=O)OC21CCCCC2)c3ccc(Cl)c...,148477-71-8,CAS-RN,DTXSID6034928,Spirodiclofen,148477-71-8,410.105165,CCC(C)(C)C(=O)OC1=C(C(=O)OC11CCCCC1)C1=CC=C(Cl...,CCC(C)(C)C(=O)OC1=C(C(=O)OC11CCCCC1)C1=CC=C(Cl...,3.11,...,6.084,146,C21H24Cl2O4,0,0,0,27,411.319527,0.215686,5
2,CN1C(O)=NC(=O)C(C)(C2=CCCCC2)C1=O,56-29-1,CAS-RN,DTXSID9023122,Hexobarbital,56-29-1,236.116092,CN1C(O)=NC(=O)C(C)(C2=CCCCC2)C1=O,CN1C(O)=NC(=O)C(C)(C2=CCCCC2)C1=O,2.23,...,1.838,90,C12H16N2O3,0,0,0,17,236.267504,0.212121,1
3,CCOc1c(ccc(c1)C(C)(C)C)C2COC(=N2)c3c(F)cccc3F,153233-91-1,CAS-RN,DTXSID8034586,Etoxazole,153233-91-1,359.169685,CCOC1=C(C=CC(=C1)C(C)(C)C)C1COC(=N1)C1=C(F)C=C...,CCOC1=C(C=CC(=C1)C(C)(C)C)C1COC(=N1)C1=C(F)C=C...,3.22,...,6.008,138,C21H23F2NO2,0,0,0,26,359.410411,0.163265,5
4,CCOP(=S)(OC(C)C)Oc1cnc(nc1)C(C)(C)C,96182-53-5,CAS-RN,DTXSID1032482,Tebupirimfos,96182-53-5,318.1167,CCOP(=S)(OC(C)C)OC1=CN=C(N=C1)C(C)(C)C,CCOP(=S)(OC(C)C)OC1=CN=C(N=C1)C(C)(C)C,2.12,...,3.253,98,C13H23N2O3PS,0,0,0,20,318.373672,0.209302,7


Does not look like CDK descriptors are included in the Fub model

In [9]:
txps = pd.read_excel(processed_dir+'ToxPrints.xlsx')

In [10]:
txps.head()

Unnamed: 0,INPUT,DTXSID,PREFERRED_NAME,atom:element_main_group,atom:element_metal_group_I_II,atom:element_metal_group_III,atom:element_metal_metalloid,atom:element_metal_poor_metal,atom:element_metal_transistion_metal,atom:element_noble_gas,...,ring:polycycle_bicyclo_propene,ring:polycycle_spiro_[2.2]pentane,ring:polycycle_spiro_[2.5]octane,ring:polycycle_spiro_[4.5]decane,ring:polycycle_spiro_1_4-dioxaspiro[4.5]decane,ring:polycycle_tricyclo_[3.5.5]_cyclopropa[cd]pentalene,ring:polycycle_tricyclo_[3.7.7]bullvalene,ring:polycycle_tricyclo_[3.7.7]semibullvalene,ring:polycycle_tricyclo_adamantane,ring:polycycle_tricyclo_benzvalene
0,94-74-6,DTXSID4024195,MCPA,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,148477-71-8,DTXSID6034928,Spirodiclofen,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,56-29-1,DTXSID9023122,Hexobarbital,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,153233-91-1,DTXSID8034586,Etoxazole,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,96182-53-5,DTXSID1032482,Tebupirimfos,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
opera = pd.read_csv(processed_dir+'Fub-sdf_OPERA2.6Pred.csv')

In [12]:
df_opera = pd.read_csv(processed_dir+'Fub-sdf_OPERA2.6Pred.csv', index_col='CASRN')[['LogP_pred','pKa_a_pred', 'pKa_b_pred']] #In MOE: Right click on mol -> Name -> Extract -> new field 'CAS'
df_opera['pKa_pred']=df_opera[['pKa_a_pred','pKa_b_pred']].min(axis=1)
df_opera = df_opera[~df_opera.index.duplicated(keep='first')]


In [13]:
opera_scaler = pickle.load(open(models_dir+'opera_scaler.sav', 'rb'))

In [14]:
# Normalize opera properties based on transformation scaler vector from the base models
opera_scaled = opera_scaler.transform(df_opera)
opera = pd.DataFrame(opera_scaled, index = df_opera.index) 
opera.columns = df_opera.columns
opera = opera[['LogP_pred','pKa_pred']]

In [15]:
#opera

Supplementary file mmc24 corresponds to the Fub final features as described in mmc1 Table S6

In [28]:
desc = pd.read_csv(external_dir+'1-s2.0-S2468111320300463-mmc24.csv') 

In [29]:
desc.Fingerprints.values

array(["['bitvector2', 'bitvector12', 'bitvector15', 'bitvector16', 'bitvector19', 'bitvector20', 'bitvector33', 'bitvector37', 'bitvector143', 'bitvector145', 'bitvector179', 'bitvector180', 'bitvector185', 'bitvector186', 'bitvector192', 'bitvector256', 'bitvector257', 'bitvector299', 'bitvector308', 'bitvector333', 'bitvector335', 'bitvector338', 'bitvector339', 'bitvector340', 'bitvector341', 'bitvector345', 'bitvector346', 'bitvector352', 'bitvector356', 'bitvector357', 'bitvector370', 'bitvector374', 'bitvector375', 'bitvector376', 'bitvector377', 'bitvector379', 'bitvector380', 'bitvector381', 'bitvector390', 'bitvector391', 'bitvector392', 'bitvector405', 'bitvector420', 'bitvector439', 'bitvector464', 'bitvector476', 'bitvector493', 'bitvector502', 'bitvector516', 'bitvector521', 'bitvector528', 'bitvector539', 'bitvector566', 'bitvector569', 'bitvector592', 'bitvector593', 'bitvector597', 'bitvector607', 'bitvector614', 'bitvector637', 'bitvector638', 'bitvector643', 'bitvect

In [30]:
pc = ['bitvector2', 'bitvector12', 'bitvector15', 'bitvector16', 'bitvector19', 'bitvector20', 'bitvector33', 'bitvector37', 'bitvector143', 'bitvector145', 'bitvector179', 'bitvector180', 'bitvector185', 'bitvector186', 'bitvector192', 'bitvector256', 'bitvector257', 'bitvector299', 'bitvector308', 'bitvector333', 'bitvector335', 'bitvector338', 'bitvector339', 'bitvector340', 'bitvector341', 'bitvector345', 'bitvector346', 'bitvector352', 'bitvector356', 'bitvector357', 'bitvector370', 'bitvector374', 'bitvector375', 'bitvector376', 'bitvector377', 'bitvector379', 'bitvector380', 'bitvector381', 'bitvector390', 'bitvector391', 'bitvector392', 'bitvector405', 'bitvector420', 'bitvector439', 'bitvector464', 'bitvector476', 'bitvector493', 'bitvector502', 'bitvector516', 'bitvector521', 'bitvector528', 'bitvector539', 'bitvector566', 'bitvector569', 'bitvector592', 'bitvector593', 'bitvector597', 'bitvector607', 'bitvector614', 'bitvector637', 'bitvector638', 'bitvector643', 'bitvector646', 'bitvector656', 'bitvector667', 'bitvector688', 'bitvector696', 'bitvector697', 'bitvector698', 'bitvector699', 'bitvector712']

In [31]:
tp= ['bond:CN_amine_aliphatic_generic', 'bond:CN_amine_ter-N_aliphatic', 'bond:COH_alcohol_generic', 'bond:CX_halide_aromatic-X_generic', 'chain:alkaneCyclic_ethyl_C2_(connect_noZ)', 'chain:alkaneLinear_ethyl_C2(H_gt_1)', 'chain:alkaneLinear_ethyl_C2_(connect_noZ_CN=4)', 'chain:aromaticAlkane_Ph-C1_acyclic_connect_noDblBd', 'ring:hetero_[6]_N_pyridine_generic']

In [32]:
len(tp)+len(pc)

80

In [33]:
print(desc['Padel+CDK'].values)

["['nN', 'nO', 'nS', 'nP', 'nF', 'nCl', 'nBr', 'nI', 'SM1_DzZ', 'SM1_Dzv']"]


In [34]:
['nN', 'nO', 'nS', 'nP', 'nF', 'nCl', 'nBr', 'nI', 'SM1_DzZ', 'SM1_Dzv']

['nN', 'nO', 'nS', 'nP', 'nF', 'nCl', 'nBr', 'nI', 'SM1_DzZ', 'SM1_Dzv']

#### Filter OPERA descriptors for the 2 descriptors needed for the model and normalise them using the 'normalizeDescripors' function

In [21]:
#df_opera = opera[['CASRN','LogP_pred','pKa_a_pred', 'pKa_b_pred']]
#df_opera['pKa_pred']=df_opera[['pKa_a_pred','pKa_b_pred']].min(axis=1)
#df_opera.set_index('CASRN', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [76]:
#df_opera = df_opera[['LogP_pred','pKa_pred']]

In [70]:
#opera_scaler = pickle.load(open(models_dir+'opera_scaler.sav', 'rb'))

In [77]:
#df_opera = normalizeDescriptors(df_opera)#[['pKa_pred','LogP_pred']]
#f_opera = df_opera[['pKa_pred','LogP_pred']]

In [80]:
#df_opera

#### Filter ToxPrints descriptor file for relevant ToxPrints needed for the model

In [35]:
txps.set_index('INPUT', inplace = True)
txps.head()

Unnamed: 0_level_0,DTXSID,PREFERRED_NAME,atom:element_main_group,atom:element_metal_group_I_II,atom:element_metal_group_III,atom:element_metal_metalloid,atom:element_metal_poor_metal,atom:element_metal_transistion_metal,atom:element_noble_gas,bond:C#N_cyano_acylcyanide,...,ring:polycycle_bicyclo_propene,ring:polycycle_spiro_[2.2]pentane,ring:polycycle_spiro_[2.5]octane,ring:polycycle_spiro_[4.5]decane,ring:polycycle_spiro_1_4-dioxaspiro[4.5]decane,ring:polycycle_tricyclo_[3.5.5]_cyclopropa[cd]pentalene,ring:polycycle_tricyclo_[3.7.7]bullvalene,ring:polycycle_tricyclo_[3.7.7]semibullvalene,ring:polycycle_tricyclo_adamantane,ring:polycycle_tricyclo_benzvalene
INPUT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
94-74-6,DTXSID4024195,MCPA,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
148477-71-8,DTXSID6034928,Spirodiclofen,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56-29-1,DTXSID9023122,Hexobarbital,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
153233-91-1,DTXSID8034586,Etoxazole,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96182-53-5,DTXSID1032482,Tebupirimfos,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
txps.drop(['DTXSID', 	'PREFERRED_NAME'],axis = 1, inplace = True)

In [39]:
txps_ = txps[tp]

In [40]:
#txps_

#### Filter Pubchem file for relevant Pubchem features needed for the model

In [41]:
pubchem.set_index('CASRN', inplace = True)

In [42]:
pubchem_ = pubchem[pc]

#### Note txps_ and pubchem_ descriptors set have different dimensions in terms of what could be calculated. Need either to merge the sets by an inner join or take the set of common ids and concatenate the dfs together. Here we take the common CASRN ids and concat the 2 df by column using axis = 1

In [43]:
ids = list(set(pubchem_.index & txps_.index))

In [44]:
txps_ = txps_.loc[ids]

In [45]:
pubchem_ = pubchem_.loc[ids]

In [46]:
fingerprints = pd.concat([pubchem_,txps_ ], axis =1)

In [47]:
fingerprints

Unnamed: 0,bitvector2,bitvector12,bitvector15,bitvector16,bitvector19,bitvector20,bitvector33,bitvector37,bitvector143,bitvector145,...,bitvector712,bond:CN_amine_aliphatic_generic,bond:CN_amine_ter-N_aliphatic,bond:COH_alcohol_generic,bond:CX_halide_aromatic-X_generic,chain:alkaneCyclic_ethyl_C2_(connect_noZ),chain:alkaneLinear_ethyl_C2(H_gt_1),chain:alkaneLinear_ethyl_C2_(connect_noZ_CN=4),chain:aromaticAlkane_Ph-C1_acyclic_connect_noDblBd,ring:hetero_[6]_N_pyridine_generic
114369-43-6,1,1,1,1,0,0,0,1,1,1,...,1,0,0,0,1,0,1,1,1,0
59-66-5,0,0,1,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
146-22-5,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54-31-9,0,0,1,0,1,1,1,1,1,0,...,0,1,0,0,1,0,0,0,0,0
68157-60-8,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194-65-6,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
149979-41-9,1,1,0,0,1,1,0,1,0,0,...,1,0,0,1,0,1,1,1,0,0
96489-71-3,1,1,1,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,1,1,0
152459-95-5,1,1,1,1,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,1


In [48]:
padel = pd.read_csv(processed_dir+'padel.csv', index_col = 'Name')

  interactivity=interactivity, compiler=compiler, result=result)


In [49]:
padel.head()

Unnamed: 0_level_0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
94-74-6,1,0.7906,0.625048,24.5181,26.427137,6,6,22,13,9,...,9.092001,25.243848,1.941834,10.285051,7.771488,0.0,266.0,15.0,1.942,60.0
148477-71-8,0,1.7152,2.941911,76.1574,60.531032,6,6,51,27,24,...,8.041278,54.16076,2.005954,16.368275,11.293233,0.0,1661.0,48.0,5.468,146.0
56-29-1,0,-0.579,0.335241,58.7187,36.394688,0,0,33,17,16,...,7.155033,33.720604,1.983565,13.671018,7.544963,6.126055,458.0,32.0,1.838,90.0
153233-91-1,0,2.6304,6.919004,48.0469,56.114239,12,12,49,26,23,...,7.329994,52.376492,2.01448,14.332016,6.073529,3.156631,1688.0,42.0,6.844,138.0
96182-53-5,0,2.7615,7.625882,63.3122,49.352239,6,6,43,20,23,...,7.398063,38.619132,1.930957,21.024075,8.841324,6.106695,871.0,27.0,4.154,98.0


In [50]:
padel_ = padel[['nN', 'nO', 'nS', 'nP', 'nF', 'nCl', 'nBr', 'nI', 'SM1_DzZ', 'SM1_Dzv']]

In [51]:
padel_ = normalizeDescriptors(padel_)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
padel_ = padel_.loc[ids]

Turns out no Padel descriptors are needed despite what is written in Table S6 and captured in MMC24 since the Fub model only needs 82 descriptors and addition of Padel descriptors results in their being 92!

In [84]:
#opera_ = df_opera.loc[ids]

In [53]:
opera_ = opera.loc[ids]

In [89]:
opera_.head()

Unnamed: 0_level_0,LogP_pred,pKa_pred
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1
114369-43-6,0.37923,-2.15506
59-66-5,-1.413779,0.145302
146-22-5,-0.154078,0.961007
54-31-9,-0.237366,-0.873755
68157-60-8,0.364267,1.859682


In [54]:
descriptors = pd.concat([fingerprints, opera_], axis=1).dropna(axis=0, how='any')

In [56]:
descriptors

Unnamed: 0_level_0,bitvector2,bitvector12,bitvector15,bitvector16,bitvector19,bitvector20,bitvector33,bitvector37,bitvector143,bitvector145,...,bond:CN_amine_ter-N_aliphatic,bond:COH_alcohol_generic,bond:CX_halide_aromatic-X_generic,chain:alkaneCyclic_ethyl_C2_(connect_noZ),chain:alkaneLinear_ethyl_C2(H_gt_1),chain:alkaneLinear_ethyl_C2_(connect_noZ_CN=4),chain:aromaticAlkane_Ph-C1_acyclic_connect_noDblBd,ring:hetero_[6]_N_pyridine_generic,LogP_pred,pKa_pred
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
114369-43-6,1,1,1,1,0,0,0,1,1,1,...,0,0,1,0,1,1,1,0,0.379230,-2.155060
59-66-5,0,0,1,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,-1.413779,0.145302
146-22-5,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.154078,0.961007
54-31-9,0,0,1,0,1,1,1,1,1,0,...,0,0,1,0,0,0,0,0,-0.237366,-0.873755
68157-60-8,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0.364267,1.859682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834-12-8,1,0,1,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0.251103,-0.331556
149979-41-9,1,1,0,0,1,1,0,1,0,0,...,0,1,0,1,1,1,0,0,0.615179,-1.962161
96489-71-3,1,1,1,0,0,0,1,1,0,0,...,0,0,0,0,0,1,1,0,1.992251,-0.575879
152459-95-5,1,1,1,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,1,0.790946,0.748339


#### Load sklearn pickle files

In [57]:
fub_rf = pickle.load(open(models_dir+'fub_rf.sav', 'rb'))
fub_svr = pickle.load(open(models_dir+'fub_svr.sav', 'rb'))

Number of features in the saved model for random forest

In [58]:
len(fub_rf.feature_importances_)


82

In [59]:
#fub_svr.predict(descriptors)

In [60]:
descriptors.shape

(992, 82)

Warning flags that a different & older version of Sklearn was used than what is in my conda environment - oh brother! I can't run a consensus model prediction because it is not possible to run the SVR models!!!

Make predictions of the substances using the RF model

In [61]:
predicted_Fub = pd.DataFrame(fub_rf.predict(descriptors), descriptors.index )

In [62]:
predicted_Fub.columns = ['pred_Fub_rf']

In [63]:
predicted_Fub_2 = pd.DataFrame(fub_svr.predict(descriptors), descriptors.index )

In [64]:
predicted_Fub_2.columns = ['pred_Fub_svr']

In [65]:
predicted_Fub_all = pd.concat([predicted_Fub, predicted_Fub_2], axis = 1)

If SVM model was available then using the df above, the consensus predictions would have been computed by taking the mean of the predictions from each model as shown below 

In [66]:
predicted_Fub_all['Consensus (SVM,RF)'] = predicted_Fub_all[['pred_Fub_svr', 'pred_Fub_rf']].mean(axis = 1)

In [86]:
predicted_Fub_all[predicted_Fub_all.index == '114369-43-6']

Unnamed: 0_level_0,pred_Fub_rf,pred_Fub_svr,"Consensus (SVM,RF)"
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
114369-43-6,-0.451356,0.01888,-0.216238


#### Comparing against the training set compounds - MMC2 in Supplementary corresponds to Fub_1139.csv here

Note that can't check whether the predictions are exact matches given we only have one of the predictions and no predicted file to check

In [68]:
fub = pd.read_csv(raw_dir+'Fub_1139.csv')

In [69]:
fub_expt = fub[fub['CASRN'].isin(predicted_Fub_all.index)]

In [70]:
fub_expt.set_index('CASRN', inplace = True)

In [71]:
fub_expt.loc[predicted_Fub_all.index]

Unnamed: 0_level_0,Name,Human.Funbound.plasma
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1
114369-43-6,Fenbuconazole,0.019731
59-66-5,Acetazolamide,0.040000
146-22-5,Nitrazepam,0.130000
54-31-9,Furosemide,0.012000
68157-60-8,Forchlorfenuron,0.028000
...,...,...
834-12-8,Ametryn,0.004000
149979-41-9,Tepraloxydim,0.383000
96489-71-3,Pyridaben,0.000000
152459-95-5,Imatinib,0.050000


In [72]:
fub_expt[fub_expt.index == '1007-28-9']

Unnamed: 0_level_0,Name,Human.Funbound.plasma
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1
1007-28-9,6-desisopropylatrazine|Deisopropylatrazine,0.459
